aboutgitcodebugslistschat
path: root/tcp.c
blob: a1860d10b15f8d32330a2d040d979e5bf9f62b24 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
                                            

                                         



                                         


                                              
                                       
                                              





                           

             






                                                                                
                                                                              



















                                                                                

                                                                             



                                                                                
                                                                    




                                                                                







                                                                                
                                                               


                                                                           
                                                                              
                                                                             
                                                                           

                                                                              





                                                                           
                                                                               
                                                                               

                                                                              



                                                                               
  







                                                                              




                                                                      

                                                                               
                 















                                                                                

                                                                               


                                                     
                    

                    



















                                                                               
  
                                                   
  
                                             
  





                                                                               











                                                                                
  


                    
                                                                           
  


                                                                             

                                                                              


                                                                               
  


                                                                               
  



                                                                              
  





                                                                             











                                                                           
  

                                                                          

                              

                               
                                


                                                               
                                                                                



                                                              




                                                                               
  


                                



                                                                              


                                                                              
                              










                                                                                
                                                      

   

                  
                  
                   
                   
                   




                         
                       
                   
                    


                      
                       
                        
                      
                    
                 
                      
 

                                                
                     
                 
               

                  
                    
                 
                       
                
                  
                 
 
                       
 



                                                         
                                           

                                                                         
 
                                                               
                                                                              
 
                                         
                                                              

                                 
                                           

                                                                             
               
                        


                       
                            













                                                                             
                            









                                                                      
 
                                                                      




                                                               
 
                                                                



                                                               
 
                                         
                                                   
 




                                                                     




                                                                    



                                
                              
                                
                                                                 



                         
                         
                         
                         



                         
                                                            
                                                



                                                                         
 




                                                                           

  










                                                                         
                                                             
                           

  
                                                                              

                                                         
 


                                                                      
                                                        
 









                                                                           



                                                                     
                                                                                
                                                        



                                                                   
                             
               
                                                             
     
                                                                    
      



                                                                     
                                                                        




                                                              
                            
 

                                                                        
                                     


                                                                     
                                                                              
                                                        



                                                                           
                      




                                                                    
                                                                    

                                                                     
                                                                     
                                                                        




                                                              
                            
 

                                                                        
                                     
 
                                      
                                                             
                                                                     
 



                                                                 
 
                         
                                                             
 

                                                                              
                                                                                
                                                        



                                                                   
                                   
               
                                                             
     
                                                                    
      


                                                                     





                                                              
                                  
 
                                           



                                                                              
                                                        



                                                                           
                                   




                                                                               
                                                                               
                                                                               
                                                                               





                                                              
                                  
 
                                           
 
                                                   
 
                                                                   
                                                
 


                                                                                
                                            

                                                     
















                                                                         
                                         
                                                                         









                                                       
                                                                        
                                             

                                                                         
                                                                         









                                                                         
                                                                        
 
                                                               
                                                                         
                                                                        

                                                        
                                     
                                   


                                                                               







                                                                     
                              
 
                                
                                                                       
                                                           
                                                                   






                                                                             



                 





                                                                                
                                                                         


                                                


                                   
                                
                                                                     
                                                         
                                                                 

                                                                        
                       
 
                                                        
                                                  

                                                                 


                                          

                               
                                 

                                                                             

                                                                 





                                           
                                           



                                                                       







                                                                   


                                                                        





                                                                           



                                              
                                                                        


                                            

                                            



                                           

                                                                               
                
                                           
 









                                                                           
                               
                 

                                    

                                                                       

         
                                                
                                       
 


                                                                            
                                       

 


                                                             





                                                                      
                                                                         







                                              
                                       









                                                                              





                                                                            
                                       

                         







                                                                                   
 


                                                                            



                                                 
                                                          
                                       



                                                                         
                                                                         



                                                                         

                                                                        
  

                                                            
                                                           



                                                
                                                                







                                                                            
                                                     
   
                                                              
                                                           
 
                  

                         

                                                         


                                                  
                                                                




                                                                           





                                                                             
                                          

                                       
                                                                  



                        


   


                                                                              
                                                     
 
                                   
                     
                   
 

                                                                 
                                                 


                       
                   




                                                                              
                                          


   


                                                                         
                                                            





                                                                             

                                                                                
 

                                                                                


   
                                                                

                                                  
   
                                                                             
 



                                                                               
 

                                        



                                                            

                                                  
   
                                                                          
 


                                                                          
 

                                               





                                                                           
   
                                                                              


              
                                              

                                                                        


                                                           



                                                            




                                                                               
                                 
   
                                                   
 
                                                        


                          
                                                       
                                                         
                                                       
                                   
                                                                             



                                                             
                                                                     
                                                       
                                                           


                  
                                                                      
                                                                        
 
                                                                            
                                                                              



                                                                               
                                 
   
                                                   



                          

                                                         
                                                         

                                                                             




                                                                     
                                                         
                                                            


                  
                                                                      
                                                                        
 
                                                                            
                                                                              


   
                                                                

                                                                           
                                       

                                                                
  
                                                                      
   

                                                                       
 
                             
 
                          
                          
 

                                                         


                                  
                                   

                              
                                         



                                                                           
                                               

                                 
                                              

                                      


                                                     
                                                  




                                         
                                             
                               
                                                                
                                
                                                                
                         






                  
                                                                           
                                                 


                                             


                                  
                                                          

                                                           
 
                                                
                                                         





                         
                                                                           
                                 


                                             
  
                                                          
   

                                                                            
 
                                                                  

                                          
                                                                        


   



                                                                    
                                                          
   

                                                              
 
                                                                   


   









                                                                             
                                                    
                                                                  

                            

                                                           





                                                       
                                                                    
                                 
                                  
   
                                                                           
 
                                             
 
                                              
                                                                                


   
                                                                    
                                 
                                  
   

                                                            
 
                                                
                                                    
 
                  




                                                                                

                                                    
                                                      
                                                                                





                                                                           


                 
                                    


   
                                                                        
                                 
                                                   


                                                                    
  
                                                          
   
                                                                
                                                                              
                                                                             
 
                              
                         
                   
 
                                        
 
                                                                   

                                                                
                                                       
 
                          


   
                                                                              
                                                    

                                                             
   
                                     
 
                                                     
 
                                       
                             
 
                          

                                   
 
                    

 
                                                                 

                                                                         
                                                                             

                                                                         
 
   

                                                                                
   
                                                       
 
                                                                         
                                   
 
                                                                         
                                   
 
 


                                                                    
   
                                                      
 


                   
                                                                 

                                                                                
                             
 
                                                                 

                                                                                
                             


   


                                                       
                                                                     
                                     
 

                                  


   





































































































                                                                              







                                                                                
                                                        
   
                                                          
                                                                      
                                                           
                                                                          
 
                                                          
                            
 


                                                                    

                                                                          
 
                                                          
                
                                                                    
 

                                                                           
 
                                                          
         
 
                    





                                                                        
                                                                                
                                                                    


                                                            
                                                                                
                                                                       
 
                                                                       
                                                        
                                                                      

                                      
                                                  

                           
                       
                        




                                                          
                                                                              
                                                                             

                                                                


                                                                         


                                         
                                                                




                                                                  
                             
 
                                         
                                     
                                                                   

                                                                         
                         

         
                     
                                                       
                                 
 
                                   
                                                                   
                                 
 

         
                  
                                                             
                                                     

                                     

                                                              
         
      
 




                                                                             
 






                                                                                     

                                                   
 
    
                                                         



                                                       







                                                                                


                                                      
                                                  

                                                                                







                                                             
                                                                
                                 
                                  
                                                                      
  
                                                               
   
                                                                             
 
                                                        
                                                    

                                              

                                      
                           
                          
                          
                          
                   
                
 


                                                               
 
                                                            
                                            
                                   

         




                                                         
                                   
                                                
 
                                                                     
                         
 
                            

                                                                      
                             


                                                                         
                

                                                                      
                             
                                

         
                          
                        
 


                                                                  

                                      

                                   
                                                 
                        
                                                             
                                          


                                                              
 
                                          
                                                                             
                                                          
                                                 
                                                                 
                 
                                                               
 
                                        
 
                                                                     
 


                                     
                                          

                                          
                
                                                        

                                                                    

         
                                              




                                  

                                                                       
 





                                                                     
 

                                                     


                                                                      
                                   








                                                                               
                                                  





                                                          
 
                                                                               
                                                  
         
 



                 
                                                                               


                                  
                                                                
 
                                   

                       
                                         
                                            


   
                                                                             
                                  


                                                                     
                                                     
                                                           
 
                                                               
 




                                        
 
   
                                                                    


                                                  
                                                                          
 

                                                                      
 

                                                                              


   

                                                                           
                                                                     
                                 
   

                                                                        
 
                                                                  
                              

                      
 
                          


                                                             
 



                                                                        
 

                                                            
 
                                                                          


   

                                                                            
  
                                                                     
   
                                  
 
                      
 

                                                  
                           
                                 
         

                  
 






                                                                            
                                                                 



                                                                 
 
                             



                            
                  
                              
 
                                   
 

                 
 
   

























                                                                            
                                                             
                                  

                                                                     


                            
                                                                 
                                                                 



                         
                                                                       



                                  



                                     




                                   






















































                                                                                


                                                                         

                                                                   


                                                                     

                                 

                                                                   

                                                                        
 

                                              

                                      
                                           
                                                     


                                        
                                            
                                                       

                                  
                                  
                         
                        
                     
 
                                   

                       































                                                                              

                                           

                            
                                                                           
                                                                       
                                                                            
                                                           




                                                                        
                                                    
                                                 
                  

                                                                            

         
                                                        
                       
                         
                                          
 
                                          
 
                                                   
                                                                  
                                                                             
                           
 






                                                                              
 
                                               
 
                            
                                               
                                   
                
                                               
                                   

         

                              
 


                                                         
 
                                   
                                                  
 
                                 
 
                               
                                                                      



                                                      
 




                                                                         

                                           
                                         


                               
                                     
                

                                     
                                                      
                               
 
                                                      

         
                               





                                


   
                                                          



                                                                   
   








                                                                                
                                                                              
 





                                                                           
 
                                                                   


                                                                    
 
                 
 
 
   
                                                                              

                                  


                                                                             
   
                                                                           
                                                                    
 
                                                 
                          
 

                                                                         
                                                                             
 


                                                                          
                                                       

                                                                        
                                                                   
                                                 


                                                                         


                                                                          
                                                       

                                                                        
                                                                   
                                                 
         




                                                                              
                                  
  
                                                    

                    
   
                                                                       
 
                                                                      
                                                            
                                                   
                                       
                                      
                                     
                                   
                          


                                                                 
                                      
                                           

                                                                          
                                                          


                                 
                                                        

                                                     
                         
         
 
                                                                            
                                                                 

                                       

                            
                                                            
         
 





                                               
                                                                              
                                                                              
                                         
 



                                                             




                                                                                
                                   
         
                    
                                                      
 
                                                                              




                                                     
                         
 











                                                                                       
 

                                     
                                            
                         

         
                                     
 

                                                   
 
                                                  
                                                
 
                                   
                   
                               
                                         
                                                                          



                                        

                                                             

         
                                             
 
                 


                                                      
                             
                                 
         
 
                   


   
                                                                  

                                  
                                                     
                                                  

                    

                                    
   
                                                                      
                                                            
 
                                                                              
                                                      
                                                      
                                                   
                                                  
                   
                  
 
                                   
                                      


                                           
                                                          
                                                  
                                        
                           

                           
                                                            

                                  
                                   
 
                                     

                                                   

                              
                                                    
                                 


                           


                                                        





                                             
 

                                                                      
                                                      

                                                                
                                                                            

                                                                    
                                                      
                         




                                


                                 






                                                                     
                                                             





                                                                        
                                                              

                                                                             



                                              
                                  
                   
                                                                             

                                 
                                            
                                       



                                         


                                                            

                        
                              
                                  


                                     

         


                                                                                
 
                                                     
 
                   


                                                                              
                                               
                                            

         
                   
                         
 
                              
      







                                                                  
                                                    

                 


                                   
                                                              
                                                              
                                              
 
                 
                          

         
                                                           
                                 
                                        
                                                      

                                        

         

                         





                                                                              
                                                        
                 
                                      
         
 


                                                       
 


                                     
                                                  
                
                                                      
         

                              


   


                                                                          


                                                                             
   
                                                                               

                                                                      
 
                                                       





                                                        
                                                            









                                                                   
                                    
                                    


   

                                                                    
                                                   
                                                   

                                   
                                                     
                                                        
                                 

                                    
   
                                                               
                                                         
                                                                              
 
                                  
                                
                           
                         
                        
                  
 

                  
                                                      

                         
                           

                                              

                                                                           
                                                             
 
                                                                                 

                                     
                    
                                                

                                                                  
                         

         
                                                            
 
                      
                                            
                         

         

                                                                        
 

                                                 
                                                     
                                                                             

                                 
 
                           
         
 

                                              

                                                       
 
                                                 
 
                              

                                             
                                                      
                                                    

                                                           
                                 

                 

                                   
 
                                                               

                                            
 
                                        

                                 
 

                                                                 

                                                                        

                                                               
                                                    
 



                                                             
                                                   


                           

                                     

                                                       



                                                                               

                                            

         


                                                   
                     







                                                                               




                                                                            
                                  
   
                                                                        
 



                        

                                                                           


                       
                                              

                       
                                              
                                             


   



























                                                                           
                                                                         
                                 
                                                           
                                  

                                                   

                                 
                                                                    
                                                           
                                                                  
                                                              
 

                                                                              
                       
                         
                                                
                                           
 
                                                            
                                                             
 
                                          
 
                                   

                                 
                                                  
 
                                            
 

                                             
 
                             


   
                                                                             



                                                   

                                                           
 
                                
                                  
                         

              
                                                

                       
                                                        
                  
                            
 


























                                                                               

                                                                         

                       
                                                                          



                                


   





                                                                              
                                                          
 
                                                         
                                                   
 
                      











                                                                              
                                       

                                                    
                                                            
                                         
                                                                           
                                                      

                                                              
                                                                         

                                         
                                                             


                                                                  
                                               












                                                                                
                                                           





                                         
                                                               
                                 
                               

                                   
                                                                          
 




                                                            


                                   
                                
                                 


                       
                                                                   
                                            
                       
         
 

                                                                  
                                                    
 

                                                           
 
                                     
                                                    
 

                                                                
 
                       

         


                                              
                       
         
 

                                                         
                       
 

                                           
                                                    

                                       


   
                                                                           
                                 

                                           
                                                                      
                                                                    
  
                                                                          
   
                                                                                
                                                                 
 
                                           
                             
                                
          

              
                                                                      
 

                                                      
                                                                     
                                                      
                                                                     
         
 
                  
                         
 

                                   


   
                                                                               
                                 



                                                                            
  
                                                                            
   

                                                                        
 
                                                     
 


                                                                            
                                 

                                                      

                                                                      
 

                                                                       
 
                                                                             


                                








                                                                            
                                           
                             
                                  
          

              
                                      
 

                                                                           















                                                                            
                                           
                             
                                  
          

              
                                      









                                                                            


   













                                                                              

                                 
            
   
                                       
 
                                                      
                      
 
                    
 
                                                  
                                                            
                                 
 
                                          





                 



                                                                 

                                                                            
   
                                                                         



                                                  

                       
                                 
                                 
 



                                                        
         

                 


   
                                                                         
                                 
   

                                                     











                                                                              
 

   
                                                                             

                                 
                                       
   
                           
 




                                                 
                    
                                      
 
                    
                                      
 

                                                                         


                                                                           
                                

                                    

                                   
                                              

         



                 
                                                         




                                                                  
                                                         



































                                                                                



                                                         


            
                                              
 
                                          
 

                                 




                 
                                                                                
                                 
                                 
   
                                                         
 
                  
 
                                    
                                                      
                                                                            
                                                             
                 
 
                                                     
                                                                            
                                                  
                 
         
 
                                

                                     
 
// SPDX-License-Identifier: GPL-2.0-or-later

/* PASST - Plug A Simple Socket Transport
 *  for qemu/UNIX domain socket mode
 *
 * PASTA - Pack A Subtle Tap Abstraction
 *  for network namespace/tap device mode
 *
 * tcp.c - TCP L2-L4 translation state machine
 *
 * Copyright (c) 2020-2022 Red Hat GmbH
 * Author: Stefano Brivio <sbrivio@redhat.com>
 */

/**
 * DOC: Theory of Operation
 *
 *
 * PASST mode
 * ==========
 *
 * This implementation maps TCP traffic between a single L2 interface (tap) and
 * native TCP (L4) sockets, mimicking and reproducing as closely as possible the
 * inferred behaviour of applications running on a guest, connected via said L2
 * interface. Four connection flows are supported:
 * - from the local host to the guest behind the tap interface:
 *   - this is the main use case for proxies in service meshes
 *   - we bind to configured local ports, and relay traffic between L4 sockets
 *     with local endpoints and the L2 interface
 * - from remote hosts to the guest behind the tap interface:
 *   - this might be needed for services that need to be addressed directly,
 *     and typically configured with special port forwarding rules (which are
 *     not needed here)
 *   - we also relay traffic between L4 sockets with remote endpoints and the L2
 *     interface
 * - from the guest to the local host:
 *   - this is not observed in practice, but implemented for completeness and
 *     transparency
 * - from the guest to external hosts:
 *   - this might be needed for applications running on the guest that need to
 *     directly access internet services (e.g. NTP)
 *
 * Relevant goals are:
 * - transparency: sockets need to behave as if guest applications were running
 *   directly on the host. This is achieved by:
 *   - avoiding port and address translations whenever possible
 *   - mirroring TCP dynamics by observation of socket parameters (TCP_INFO
 *     socket option) and TCP headers of packets coming from the tap interface,
 *     reapplying those parameters in both flow directions (including TCP_MSS
 *     socket option)
 * - simplicity: only a small subset of TCP logic is implemented here and
 *   delegated as much as possible to the TCP implementations of guest and host
 *   kernel. This is achieved by:
 *   - avoiding a complete TCP stack reimplementation, with a modified TCP state
 *     machine focused on the translation of observed events instead
 *   - mirroring TCP dynamics as described above and hence avoiding the need for
 *     segmentation, explicit queueing, and reassembly of segments
 * - security:
 *   - no dynamic memory allocation is performed
 *   - TODO: synflood protection
 *
 * Portability is limited by usage of Linux-specific socket options.
 *
 *
 * Limits
 * ------
 *
 * To avoid the need for dynamic memory allocation, a maximum, reasonable amount
 * of connections is defined by TCP_MAX_CONNS (currently 128k).
 *
 * Data needs to linger on sockets as long as it's not acknowledged by the
 * guest, and is read using MSG_PEEK into preallocated static buffers sized
 * to the maximum supported window, 16 MiB ("discard" buffer, for already-sent
 * data) plus a number of maximum-MSS-sized buffers. This imposes a practical
 * limitation on window scaling, that is, the maximum factor is 256. Larger
 * factors will be accepted, but resulting, larger values are never advertised
 * to the other side, and not used while queueing data.
 *
 *
 * Ports
 * -----
 *
 * To avoid the need for ad-hoc configuration of port forwarding or allowed
 * ports, listening sockets can be opened and bound to all unbound ports on the
 * host, as far as process capabilities allow. This service needs to be started
 * after any application proxy that needs to bind to local ports. Mapped ports
 * can also be configured explicitly.
 *
 * No port translation is needed for connections initiated remotely or by the
 * local host: source port from socket is reused while establishing connections
 * to the guest.
 *
 * For connections initiated by the guest, it's not possible to force the same
 * source port as connections are established by the host kernel: that's the
 * only port translation needed.
 *
 *
 * Connection tracking and storage
 * -------------------------------
 *
 * Connections are tracked by struct tcp_tap_conn entries in the @tc
 * array, containing addresses, ports, TCP states and parameters. This
 * is statically allocated and indexed by an arbitrary connection
 * number. The array is compacted whenever a connection is closed, by
 * remapping the highest connection index in use to the one freed up.
 *
 * References used for the epoll interface report the connection index used for
 * the @tc array.
 *
 * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for
 * separate data structures depending on the protocol version.
 *
 * - Inbound connection requests (to the guest) are mapped using the triple
 *   < source IP address, source port, destination port >
 * - Outbound connection requests (from the guest) are mapped using the triple
 *   < destination IP address, destination port, source port >
 *   where the source port is the one used by the guest, not the one used by the
 *   corresponding host socket
 *
 *
 * Initialisation
 * --------------
 *
 * Up to 2^15 + 2^14 listening sockets (excluding ephemeral ports, repeated for
 * IPv4 and IPv6) can be opened and bound to wildcard addresses. Some will fail
 * to bind (for low ports, or ports already bound, e.g. by a proxy). These are
 * added to the epoll list, with no separate storage.
 *
 *
 * Events and states
 * -----------------
 *
 * Instead of tracking connection states using a state machine, connection
 * events are used to determine state and actions for a given connection. This
 * makes the implementation simpler as most of the relevant tasks deal with
 * reactions to events, rather than state-associated actions. For user
 * convenience, approximate states are mapped in logs from events by
 * @tcp_state_str.
 *
 * The events are:
 *
 * - SOCK_ACCEPTED	connection accepted from socket, SYN sent to tap/guest
 *
 * - TAP_SYN_RCVD	tap/guest initiated connection, SYN received
 *
 * - TAP_SYN_ACK_SENT	SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only
 *
 * - ESTABLISHED	connection established, the following events are valid:
 *
 * - SOCK_FIN_RCVD	FIN (EPOLLRDHUP) received from socket
 *
 * - SOCK_FIN_SENT	FIN (write shutdown) sent to socket
 *
 * - TAP_FIN_RCVD	FIN received from tap/guest
 *
 * - TAP_FIN_SENT	FIN sent to tap/guest
 *
 * - TAP_FIN_ACKED	ACK to FIN seen from tap/guest
 *
 * Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD,
 * ESTABLISHED) clears all the other events, as those represent the fundamental
 * connection states. No events (events == CLOSED) means the connection is
 * closed.
 *
 * Connection setup
 * ----------------
 *
 * - inbound connection (from socket to guest): on accept() from listening
 *   socket, the new socket is mapped in connection tracking table, and
 *   three-way handshake initiated towards the guest, advertising MSS and window
 *   size and scaling from socket parameters
 * - outbound connection (from guest to socket): on SYN segment from guest, a
 *   new socket is created and mapped in connection tracking table, setting
 *   MSS and window clamping from header and option of the observed SYN segment
 *
 *
 * Aging and timeout
 * -----------------
 *
 * Timeouts are implemented by means of timerfd timers, set based on flags:
 *
 * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag
 *   ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the
 *   connection
 *
 * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending
 *   data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the
 *   socket and reset sequence to what was acknowledged. If this persists for
 *   more than TCP_MAX_RETRANS times in a row, reset the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE
 *   with TAP_FIN_SENT event), and no ACK is received within this time, reset
 *   the connection
 *
 * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN
 *   segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and
 *   TAP_FIN_ACKED), but no activity is detected from the socket within
 *   this time, reset the connection
 *
 * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on
 *   either side, the connection is reset
 *
 * - ACK_INTERVAL elapsed after data segment received from tap without having
 *   sent an ACK segment, or zero-sized window advertised to tap/guest (flag
 *   ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent
 *
 *
 * Summary of data flows (with ESTABLISHED event)
 * ----------------------------------------------
 *
 * @seq_to_tap:		next sequence for packets to tap/guest
 * @seq_ack_from_tap:	last ACK number received from tap/guest
 * @seq_from_tap:	next sequence for packets from tap/guest (expected)
 * @seq_ack_to_tap:	last ACK number sent to tap/guest
 *
 * @seq_init_from_tap:	initial sequence number from tap/guest
 * @seq_init_to_tap:	initial sequence number to tap/guest
 *
 * @wnd_from_tap:	last window size received from tap, never scaled
 * @wnd_to_tap:		last window size advertised to tap, never scaled
 *
 * - from socket to tap/guest:
 *   - on new data from socket:
 *     - peek into buffer
 *     - send data to tap/guest:
 *       - starting at offset (@seq_to_tap - @seq_ack_from_tap)
 *       - in MSS-sized segments
 *       - increasing @seq_to_tap at each segment
 *       - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap)
 *     - on read error, send RST to tap/guest, close socket
 *     - on zero read, send FIN to tap/guest, set TAP_FIN_SENT
 *   - on ACK from tap/guest:
 *     - set @ts_ack_from_tap
 *     - check if it's the second duplicated ACK
 *     - consume buffer by difference between new ack_seq and @seq_ack_from_tap
 *     - update @seq_ack_from_tap from ack_seq in header
 *     - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and
 *       resend with steps listed above
 *
 * - from tap/guest to socket:
 *   - on packet from tap/guest:
 *     - set @ts_tap_act
 *     - check seq from header against @seq_from_tap, if data is missing, send
 *       two ACKs with number @seq_ack_to_tap, discard packet
 *     - otherwise queue data to socket, set @seq_from_tap to seq from header
 *       plus payload length
 *     - in ESTABLISHED state, send ACK to tap as soon as we queue to the
 *       socket. In other states, query socket for TCP_INFO, set
 *       @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and
 *       send ACK to tap/guest
 *
 *
 * PASTA mode
 * ==========
 *
 * For traffic directed to TCP ports configured for mapping to the tuntap device
 * in the namespace, and for non-local traffic coming from the tuntap device,
 * the implementation is identical to the PASST mode described in the previous
 * section.
 *
 * For local traffic directed to TCP ports configured for direct mapping between
 * namespaces, see the implementation in tcp_splice.c.
 */

#include <sched.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <time.h>
#include <arpa/inet.h>

#include <linux/tcp.h> /* For struct tcp_info */

#include "checksum.h"
#include "util.h"
#include "ip.h"
#include "passt.h"
#include "tap.h"
#include "siphash.h"
#include "pcap.h"
#include "tcp_splice.h"
#include "log.h"
#include "inany.h"
#include "flow.h"

#include "flow_table.h"

/* Sides of a flow as we use them in "tap" connections */
#define	SOCKSIDE	0
#define	TAPSIDE		1

#define TCP_FRAMES_MEM			128
#define TCP_FRAMES							\
	(c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)

#define TCP_HASH_TABLE_LOAD		70		/* % */
#define TCP_HASH_TABLE_SIZE		(FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)

#define MAX_WS				8
#define MAX_WINDOW			(1 << (16 + (MAX_WS)))

/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT			536

/**
 * struct tcp4_l2_head - Headers preceding the payload in IPv4 frame buffers
 * @pad:	Same padding as used in tcp4_l2_buf_t
 * @taph:	Tap-level headers
 * @iph:	IPv4 header
 * @th:		TCP header
 *
 * Only used to compute, by means of sizeof(), the space left for payload in
 * MSS4: members and attributes need to match those of tcp4_l2_buf_t up to, and
 * not including, its data member.
 */
struct tcp4_l2_head {	/* For MSS4 macro: keep in sync with tcp4_l2_buf_t */
#ifdef __AVX2__
	uint8_t pad[26];
#else
	uint8_t pad[2];
#endif
	struct tap_hdr taph;
	struct iphdr iph;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/**
 * struct tcp6_l2_head - Headers preceding the payload in IPv6 frame buffers
 * @pad:	Same padding as used in tcp6_l2_buf_t
 * @taph:	Tap-level headers
 * @ip6h:	IPv6 header
 * @th:		TCP header
 *
 * Only used to compute, by means of sizeof(), the space left for payload in
 * MSS6: members and attributes need to match those of tcp6_l2_buf_t up to, and
 * not including, its data member.
 */
struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
#ifdef __AVX2__
	uint8_t pad[14];
#else
	uint8_t pad[2];
#endif
	struct tap_hdr taph;
	struct ipv6hdr ip6h;
	struct tcphdr th;
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)));
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
#endif

/* Payload bytes available in a frame buffer: USHRT_MAX minus the size of
 * padding and headers for the given IP version, rounded down (ROUND_DOWN) to a
 * multiple of 4. These also size the data members of tcp4_l2_buf_t and
 * tcp6_l2_buf_t.
 */
#define MSS4	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4)
#define MSS6	ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4)

#define WINDOW_DEFAULT			14600		/* RFC 6928 */

/* KERNEL_REPORTS_SND_WND() - Does the kernel report the sending window?
 * @c:		Execution context pointer (any pointer expression)
 *
 * Evaluates to the kernel_snd_wnd value from the context if built with
 * HAS_SND_WND, and to constant 0 otherwise. The argument is parenthesised so
 * that expressions such as 'ctx + 1' or '&ctx' expand correctly.
 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c)	((c)->tcp.kernel_snd_wnd)
#else
# define KERNEL_REPORTS_SND_WND(c)	(0 && (c))
#endif

/* Timer intervals: ACK_INTERVAL is applied to tv_nsec as milliseconds, all the
 * others are applied to tv_sec, see tcp_timer_ctl()
 */
#define ACK_INTERVAL			10		/* ms */
#define SYN_TIMEOUT			10		/* s */
#define ACK_TIMEOUT			2		/* s */
#define FIN_TIMEOUT			60		/* s */
#define ACT_TIMEOUT			7200		/* s */

/* Size of the low_rtt_dst table, and RTT threshold for addresses stored there */
#define LOW_RTT_TABLE_SIZE		8
#define LOW_RTT_THRESHOLD		10 /* us */

/* We need to include <linux/tcp.h> for tcpi_bytes_acked, instead of
 * <netinet/tcp.h>, but that doesn't include a definition for SOL_TCP
 */
#define SOL_TCP				IPPROTO_TCP

/* Sequence number comparisons: the difference between operands is compared to
 * MAX_WINDOW, so that results hold across wrap-around as long as the two
 * sequence numbers are less than MAX_WINDOW apart. NOTE(review): this relies
 * on modular arithmetic, presumably operands are always uint32_t -- confirm
 * against callers.
 */
#define SEQ_LE(a, b)			((b) - (a) < MAX_WINDOW)
#define SEQ_LT(a, b)			((b) - (a) - 1 < MAX_WINDOW)
#define SEQ_GE(a, b)			((a) - (b) < MAX_WINDOW)
#define SEQ_GT(a, b)			((a) - (b) - 1 < MAX_WINDOW)

/* TCP flags: values match the corresponding bits of the TCP header flags */
#define FIN		(1 << 0)
#define SYN		(1 << 1)
#define RST		(1 << 2)
#define ACK		(1 << 4)
/* Flags for internal usage */
#define DUP_ACK		(1 << 5)
#define ACK_IF_NEEDED	0		/* See tcp_send_flag() */

/* TCP option kinds, and total lengths (kind and length bytes included) for the
 * options we build ourselves
 */
#define OPT_EOL		0
#define OPT_NOP		1
#define OPT_MSS		2
#define OPT_MSS_LEN	4
#define OPT_WS		3
#define OPT_WS_LEN	3
#define OPT_SACKP	4
#define OPT_SACK	5
#define OPT_TS		8

/* Connection helpers, @conn is a pointer to a TCP tap connection. The argument
 * is always parenthesised, so any pointer expression ('conn + 1', '&tc[i]')
 * can be passed.
 *
 * CONN_V4():		faddr is an IPv4(-mapped) address
 * CONN_V6():		faddr is not an IPv4(-mapped) address
 * CONN_IS_CLOSING():	ESTABLISHED is set together with at least one of
 *			SOCK_FIN_RCVD, TAP_FIN_RCVD
 * CONN_HAS():		all the events in @set are set for the connection. Note
 *			that @set is evaluated twice.
 */
#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
#define CONN_V6(conn)		(!CONN_V4(conn))
#define CONN_IS_CLOSING(conn)						\
	(((conn)->events & ESTABLISHED) &&				\
	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set)	(((conn)->events & (set)) == (set))

/* Event names for logging, indexed as done in conn_event_do(): by fls() of the
 * event bit, plus one if ESTABLISHED is set together with other events
 */
static const char *tcp_event_str[] __attribute((__unused__)) = {
	"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",

	"SOCK_FIN_RCVD", "SOCK_FIN_SENT", "TAP_FIN_RCVD", "TAP_FIN_SENT",
	"TAP_FIN_ACKED",
};

/* Approximate TCP state names for logging, indexed as done in conn_event_do():
 * by fls() of the connection events, plus one if ESTABLISHED is set together
 * with other events, plus five if the ACTIVE_CLOSE flag is set
 */
static const char *tcp_state_str[] __attribute((__unused__)) = {
	"SYN_RCVD", "SYN_SENT", "ESTABLISHED",
	"SYN_RCVD",	/* approximately maps to TAP_SYN_ACK_SENT */

	/* Passive close: */
	"CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK",
	/* Active close (+5): */
	"CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT",
};

/* Connection flag names for logging, indexed by fls() of the flag bit, see
 * conn_flag_do()
 */
static const char *tcp_flag_str[] __attribute((__unused__)) = {
	"STALLED", "LOCAL", "ACTIVE_CLOSE", "ACK_TO_TAP_DUE",
	"ACK_FROM_TAP_DUE",
};

/* Listening sockets, used for automatic port forwarding in pasta mode only.
 * One entry per port number and IP version.
 * NOTE(review): presumably _init_ext holds sockets opened in the initial
 * namespace and _ns those opened in the target namespace -- confirm.
 */
static int tcp_sock_init_ext	[NUM_PORTS][IP_VERSIONS];
static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];

/* Table of guest side forwarding addresses with very low RTT (assumed
 * to be local to the host), LRU. Looked up by tcp_rtt_dst_low().
 */
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];

/**
 * struct tcp_buf_seq_update - Sequences to update with length of frames once
 *			       sent
 * @seq:	Pointer to sequence number sent to tap-side, to be updated
 * @len:	TCP payload length
 *
 * One entry is kept for each frame buffer, see tcp4_l2_buf_seq_update and
 * tcp6_l2_buf_seq_update.
 */
struct tcp_buf_seq_update {
	uint32_t *seq;
	uint16_t len;
};

/* Static buffers */

/**
 * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Trailing comments on members give byte offsets, for AVX2 builds (left) and
 * for builds without AVX2 (right). Members before @data need to be kept in
 * sync with struct tcp4_l2_head.
 */
static struct tcp4_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[26];	/* 0, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 26				2 */
	struct iphdr iph;	/* 44				20 */
	struct tcphdr th;	/* 64				40 */
	uint8_t data[MSS4];	/* 84				60 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];

/* Sequence updates to apply once frames are sent, one for each IPv4 buffer */
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp4_l2_buf entries queued -- confirm */
static unsigned int tcp4_l2_buf_used;

/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * Trailing comments on members give byte offsets, for AVX2 builds (left) and
 * for builds without AVX2 (right). Members before @data need to be kept in
 * sync with struct tcp6_l2_head.
 *
 * NOTE(review): unlike tcp4_l2_buf, tcp4_l2_flags_buf and tcp6_l2_flags_buf,
 * this array is not declared static -- check whether external linkage is
 * actually needed.
 */
struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 14				2 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[MSS6];	/* 92				80 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];

/* Sequence updates to apply once frames are sent, one for each IPv6 buffer */
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp6_l2_buf entries queued -- confirm */
static unsigned int tcp6_l2_buf_used;

/* recvmsg()/sendmsg() data for tap */

/* Discard buffer, sized to the maximum supported window (MAX_WINDOW) */
static char 		tcp_buf_discard		[MAX_WINDOW];
/* One entry more than the number of frame buffers. NOTE(review): presumably
 * the extra entry is for tcp_buf_discard -- confirm against users.
 */
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

/* One I/O vector entry for each data frame buffer and each flags frame buffer,
 * per IP version
 */
static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];

/* sendmsg() to socket */
static struct iovec	tcp_iov			[UIO_MAXIOV];

/**
 * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @taph:	Tap-level headers (partially pre-filled)
 * @iph:	Pre-filled IP header (except for tot_len and saddr)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options
 *
 * Trailing comments on members give byte offsets, for AVX2 builds (left) and
 * for builds without AVX2 (right). @opts is sized for one MSS option, one
 * window scaling option, and one more byte.
 */
static struct tcp4_l2_flags_buf_t {
#ifdef __AVX2__
	uint8_t pad[26];	/* 0, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 26				2 */
	struct iphdr iph;	/* 44				20 */
	struct tcphdr th;	/* 64				40 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_flags_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp4_l2_flags_buf entries queued --
 * confirm
 */
static unsigned int tcp4_l2_flags_buf_used;

/**
 * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags)
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @opts:	Headroom for TCP options
 *
 * Trailing comments on members give byte offsets, for AVX2 builds (left) and
 * for builds without AVX2 (right). @opts is sized for one MSS option, one
 * window scaling option, and one more byte. Unlike in the other buffer types,
 * @th carries an explicit 4-byte alignment attribute here.
 */
static struct tcp6_l2_flags_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes		   0 */
#endif
	struct tap_hdr taph;	/* 14					   2 */
	struct ipv6hdr ip6h;	/* 32					  20 */
	struct tcphdr th	/* 72 */ __attribute__ ((aligned(4))); /* 60 */
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_flags_buf[TCP_FRAMES_MEM];

/* NOTE(review): presumably the count of tcp6_l2_flags_buf entries queued --
 * confirm
 */
static unsigned int tcp6_l2_flags_buf_used;

/* Pointer to the TCP connection stored in the flow table entry at index idx */
#define CONN(idx)		(&(FLOW(idx)->tcp))

/* Table for lookup from remote address, local port, remote port */
static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE];

/* The hash table needs at least as many slots as there can be flows */
static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
	"Safe linear probing requires hash table larger than connection table");

/* Pools for pre-opened sockets (in init), one for each IP version. These are
 * not static, so they are visible to other compilation units.
 */
int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
int init_sock_pool6		[TCP_SOCK_POOL_SIZE];

/**
 * tcp_conn_epoll_events() - epoll events mask for given connection state
 * @events:	Current connection events
 * @conn_flags:	Connection flags
 *
 * Return: epoll events mask corresponding to implied connection state, 0 if
 *	   no events are set (closed connection)
 */
static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
{
	uint32_t mask;

	if (events == 0)
		return 0;

	if (!(events & ESTABLISHED)) {
		/* Handshake: wait for the socket to become writable only if
		 * the connection was just requested from the tap side
		 */
		if (events == TAP_SYN_RCVD)
			mask = EPOLLOUT | EPOLLET | EPOLLRDHUP;
		else
			mask = EPOLLRDHUP;

		return mask;
	}

	/* Established: once FIN was sent to tap, no events are requested */
	if (events & TAP_FIN_SENT)
		mask = EPOLLET;
	else if (conn_flags & STALLED)
		mask = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
	else
		mask = EPOLLIN | EPOLLRDHUP;

	return mask;
}

/* Forward declaration: needed by the conn_flag() wrapper below */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
			 unsigned long flag);
/* conn_flag() - Trace the calling function and line, then call conn_flag_do()
 * with the same arguments
 */
#define conn_flag(c, conn, flag)					\
	do {								\
		flow_trace(conn, "flag at %s:%i", __func__, __LINE__);	\
		conn_flag_do(c, conn, flag);				\
	} while (0)

/**
 * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: 0 on success, negative error code on failure (not on deletion)
 */
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
	int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
	union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
				.flowside = FLOW_SIDX(conn, SOCKSIDE) };
	struct epoll_event ev = { .data.u64 = ref.u64 };

	if (conn->events == CLOSED) {
		if (conn->in_epoll)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
		if (conn->timer != -1)
			epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
		return 0;
	}

	ev.events = tcp_conn_epoll_events(conn->events, conn->flags);

	if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
		return -errno;

	conn->in_epoll = true;

	if (conn->timer != -1) {
		union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
					  .fd = conn->sock,
					  .flow = FLOW_IDX(conn) };
		struct epoll_event ev_t = { .data.u64 = ref_t.u64,
					    .events = EPOLLIN | EPOLLET };

		if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
			return -errno;
	}

	return 0;
}

/**
 * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Nothing is done for closed connections. If the connection has no timer yet,
 * one is created and added to the epoll set; on failure, the connection is
 * left without a timer. The interval is then selected, in order of priority:
 * ACK_INTERVAL if ACK_TO_TAP_DUE is set, SYN_TIMEOUT or ACK_TIMEOUT if
 * ACK_FROM_TAP_DUE is set (without and with ESTABLISHED, respectively),
 * FIN_TIMEOUT if both SOCK_FIN_SENT and TAP_FIN_ACKED are set, ACT_TIMEOUT
 * otherwise.
 *
 * #syscalls timerfd_create timerfd_settime
 */
static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
	struct itimerspec it = { { 0 }, { 0 } };

	if (conn->events == CLOSED)
		return;

	if (conn->timer == -1) {
		union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER,
					.fd = conn->sock,
					.flow = FLOW_IDX(conn) };
		struct epoll_event ev = { .data.u64 = ref.u64,
					  .events = EPOLLIN | EPOLLET };
		int fd;

		fd = timerfd_create(CLOCK_MONOTONIC, 0);
		if (fd == -1) {
			flow_dbg(conn, "failed to get timer: %s",
				 strerror(errno));
			return;
		}

		if (fd > FD_REF_MAX) {
			/* timerfd_create() succeeded, errno doesn't describe
			 * this failure: don't report it
			 */
			flow_dbg(conn,
				 "failed to get timer: descriptor out of range");
			close(fd);
			return;
		}
		conn->timer = fd;

		if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
			flow_dbg(conn, "failed to add timer: %s",
				 strerror(errno));
			close(conn->timer);
			conn->timer = -1;
			return;
		}
	}

	if (conn->flags & ACK_TO_TAP_DUE) {
		it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000;
	} else if (conn->flags & ACK_FROM_TAP_DUE) {
		if (!(conn->events & ESTABLISHED))
			it.it_value.tv_sec = SYN_TIMEOUT;
		else
			it.it_value.tv_sec = ACK_TIMEOUT;
	} else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) {
		it.it_value.tv_sec = FIN_TIMEOUT;
	} else {
		it.it_value.tv_sec = ACT_TIMEOUT;
	}

	flow_dbg(conn, "timer expires in %llu.%03llus",
		 (unsigned long long)it.it_value.tv_sec,
		 (unsigned long long)it.it_value.tv_nsec / 1000 / 1000);

	/* A timer that couldn't be armed never expires: don't let this pass
	 * unnoticed
	 */
	if (timerfd_settime(conn->timer, 0, &it, NULL))
		flow_dbg(conn, "failed to set timer: %s", strerror(errno));
}

/**
 * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flag:	Flag to set, or ~flag to unset
 *
 * Timers are updated as well if the change affects ACK_FROM_TAP_DUE or
 * ACK_TO_TAP_DUE.
 */
static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
			 unsigned long flag)
{
	/* More than one bit set: this is the complement of a flag, that is,
	 * a request to drop it
	 */
	bool drop = flag & (flag - 1);
	bool rearm;

	if (drop) {
		int idx = fls(~flag);

		if (!(conn->flags & ~flag))
			return;		/* Not set, nothing to drop */

		conn->flags &= flag;
		if (idx >= 0)
			flow_dbg(conn, "%s dropped", tcp_flag_str[idx]);
	} else if (conn->flags & flag) {
		/* Special case: setting ACK_FROM_TAP_DUE on a connection where
		 * it's already set is used to re-schedule the existing timer.
		 * TODO: define clearer semantics for timer-related flags and
		 * factor this into the logic below.
		 */
		if (flag == ACK_FROM_TAP_DUE)
			tcp_timer_ctl(c, conn);

		return;
	} else {
		int idx = fls(flag);

		conn->flags |= flag;
		if (idx >= 0)
			flow_dbg(conn, "%s", tcp_flag_str[idx]);
	}

	if (flag == STALLED || flag == ~STALLED)
		tcp_epoll_ctl(c, conn);

	/* Setting either ACK-related flag always needs a timer update,
	 * dropping one needs it only if the other one is still pending
	 */
	rearm = flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE;
	if (!rearm && flag == ~ACK_FROM_TAP_DUE)
		rearm = conn->flags & ACK_TO_TAP_DUE;
	if (!rearm && flag == ~ACK_TO_TAP_DUE)
		rearm = conn->flags & ACK_FROM_TAP_DUE;

	if (rearm)
		tcp_timer_ctl(c, conn);
}

/* Forward declaration: called by conn_event_do() once a connection is closed */
static void tcp_hash_remove(const struct ctx *c,
			    const struct tcp_tap_conn *conn);

/**
 * conn_event_do() - Set and log connection events, update epoll state
 * @c:		Execution context
 * @conn:	Connection pointer
 * @event:	Connection event
 *
 * Nothing is done if @event shares any bit with the events already set.
 * CLOSED and events in CONN_STATE_BITS replace the current set of events, any
 * other event is added to it.
 */
static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
			  unsigned long event)
{
	/* Indices into tcp_state_str (prev, new) and tcp_event_str (num).
	 * NOTE(review): fls() presumably returns -1 for zero, the checks
	 * against -1 below rely on that -- confirm in util.h
	 */
	int prev, new, num = fls(event);

	if (conn->events & event)
		return;

	prev = fls(conn->events);
	if (conn->flags & ACTIVE_CLOSE)
		prev += 5;	/* Skip to "Active close" names */

	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED))
		prev++;		/* i.e. SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */

	if (event == CLOSED || (event & CONN_STATE_BITS))
		conn->events = event;
	else
		conn->events |= event;

	new = fls(conn->events);

	/* Same adjustments as for prev, applied to the updated events: num is
	 * shifted as well, to select the right event name
	 */
	if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) {
		num++;
		new++;
	}
	if (conn->flags & ACTIVE_CLOSE)
		new += 5;

	/* Log the state transition only if the mapped state changed */
	if (prev != new)
		flow_dbg(conn, "%s: %s -> %s",
			 num == -1 	       ? "CLOSED" : tcp_event_str[num],
			 prev == -1	       ? "CLOSED" : tcp_state_str[prev],
			 (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]);
	else
		flow_dbg(conn, "%s",
			 num == -1 	       ? "CLOSED" : tcp_event_str[num]);

	/* Closed: drop from hash table. FIN from tap before any FIN from the
	 * socket: this is an active close. Anything else: refresh epoll mask.
	 */
	if (event == CLOSED)
		tcp_hash_remove(c, conn);
	else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
		conn_flag(c, conn, ACTIVE_CLOSE);
	else
		tcp_epoll_ctl(c, conn);

	/* tcp_timer_ctl() applies FIN_TIMEOUT for this combination of events,
	 * unless an ACK-related flag takes precedence
	 */
	if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
		tcp_timer_ctl(c, conn);
}

/* conn_event() - Trace the calling function and line, then call
 * conn_event_do() with the same arguments
 */
#define conn_event(c, conn, event)					\
	do {								\
		flow_trace(conn, "event at %s:%i", __func__, __LINE__);	\
		conn_event_do(c, conn, event);				\
	} while (0)

/**
 * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
 * @conn:	Connection pointer
 *
 * Return: 1 if destination is in low RTT table, 0 otherwise
 */
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
	int i;

	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
		if (inany_equals(&conn->faddr, low_rtt_dst + i))
			return 1;

	return 0;
}

/**
 * tcp_rtt_dst_check() - Check tcpi_min_rtt, insert endpoint in table if low
 * @conn:	Connection pointer
 * @tinfo:	Pointer to struct tcp_info for socket
 */
static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
			      const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
	int i, hole = -1;

	if (!tinfo->tcpi_min_rtt ||
	    (int)tinfo->tcpi_min_rtt >