aboutgitcodebugslistschat
path: root/tcp.h
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2021-07-17 08:34:53 +0200
committerStefano Brivio <sbrivio@redhat.com>2021-07-17 11:04:22 +0200
commit33482d5bf29312464b208beb01a5302257e82fe6 (patch)
tree6fcb11961ecca0cbed42bccbba15b1d4fe73a62c /tcp.h
parent28fca04eb990f11608187252ca8949d7df22ce9d (diff)
downloadpasst-33482d5bf29312464b208beb01a5302257e82fe6.tar
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.gz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.bz2
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.lz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.xz
passt-33482d5bf29312464b208beb01a5302257e82fe6.tar.zst
passt-33482d5bf29312464b208beb01a5302257e82fe6.zip
passt: Add PASTA mode, major rework
PASTA (Pack A Subtle Tap Abstraction) provides quasi-native host connectivity to an otherwise disconnected, unprivileged network and user namespace, similarly to slirp4netns. Given that the implementation is largely overlapping with PASST, no separate binary is built: 'pasta' (and 'passt4netns' for clarity) both link to 'passt', and the mode of operation is selected depending on how the binary is invoked.

Usage example:

  $ unshare -rUn
  # echo $$
  1871759

  $ ./pasta 1871759 # From another terminal

  # udhcpc -i pasta0 2>/dev/null
  # ping -c1 pasta.pizza
  PING pasta.pizza (64.190.62.111) 56(84) bytes of data.
  64 bytes from 64.190.62.111 (64.190.62.111): icmp_seq=1 ttl=255 time=34.6 ms

  --- pasta.pizza ping statistics ---
  1 packets transmitted, 1 received, 0% packet loss, time 0ms
  rtt min/avg/max/mdev = 34.575/34.575/34.575/0.000 ms
  # ping -c1 spaghetti.pizza
  PING spaghetti.pizza(2606:4700:3034::6815:147a (2606:4700:3034::6815:147a)) 56 data bytes
  64 bytes from 2606:4700:3034::6815:147a (2606:4700:3034::6815:147a): icmp_seq=1 ttl=255 time=29.0 ms

  --- spaghetti.pizza ping statistics ---
  1 packets transmitted, 1 received, 0% packet loss, time 0ms
  rtt min/avg/max/mdev = 28.967/28.967/28.967/0.000 ms

This entails a major rework, especially with regard to the storage of tracked connections and to the semantics of epoll(7) references.

Indexing TCP and UDP bindings merely by socket proved to be inflexible and unsuitable to handle different connection flows: pasta also provides Layer-2 to Layer-2 socket mapping between init and a separate namespace for local connections, using a pair of splice() system calls for TCP, and a recvmmsg()/sendmmsg() pair for UDP local bindings. For instance, building on the previous example:

  # ip link set dev lo up
  # iperf3 -s
  $ iperf3 -c ::1 -Z -w 32M -l 1024k -P2 | tail -n4
  [SUM] 0.00-10.00 sec 52.3 GBytes 44.9 Gbits/sec 283 sender
  [SUM] 0.00-10.43 sec 52.3 GBytes 43.1 Gbits/sec receiver

  iperf Done.

epoll(7) references now include a generic part in order to demultiplex data to the relevant protocol handler, using 24 bits for the socket number, and an opaque portion reserved for usage by the single protocol handlers, in order to track sockets back to corresponding connections and bindings.

A number of fixes pertaining to TCP state machine and congestion window handling are also included here.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'tcp.h')
-rw-r--r--tcp.h45
1 files changed, 32 insertions, 13 deletions
diff --git a/tcp.h b/tcp.h
index 7435c41..6a9aa4a 100644
--- a/tcp.h
+++ b/tcp.h
@@ -3,9 +3,12 @@
#define TCP_TIMER_INTERVAL 20 /* ms */
+#define TCP_MAX_CONNS (128 * 1024)
+#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2)
+
struct ctx;
-void tcp_sock_handler(struct ctx *c, int s, uint32_t events, char *pkt_buf,
+void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
struct timespec *now);
int tcp_tap_handler(struct ctx *c, int af, void *addr,
struct tap_msg *msg, int count, struct timespec *now);
@@ -13,24 +16,40 @@ int tcp_sock_init(struct ctx *c);
void tcp_timer(struct ctx *c, struct timespec *ts);
/**
+ * union tcp_epoll_ref - epoll reference portion for TCP connections
+ * @listen: Set if this file descriptor is a listening socket
+ * @splice: Set if descriptor is associated with a spliced connection
+ * @v6: Set for IPv6 sockets or connections
+ * @index: 20-bit index of connection in table, or port for bound sockets
+ * @u32: The flags and index above, viewed as one opaque u32 value
+ */
+union tcp_epoll_ref {
+ struct {
+ uint32_t listen:1,
+ splice:1,
+ v6:1,
+ index:20;
+ };
+ uint32_t u32;
+};
+
+/**
* struct tcp_ctx - Execution context for TCP routines
* @hash_secret: 128-bit secret for hash functions, ISN and hash table
- * @fd_min: Lowest file descriptor number for TCP ever used
- * @fd_max: Highest file descriptor number for TCP ever used
- * @fd_listen_min: Lowest file descriptor number for listening sockets
- * @fd_listen_max: Highest file descriptor number for listening sockets
- * @fd_conn_min: Lowest file descriptor number for connected sockets
- * @fd_conn_max: Highest file descriptor number for connected sockets
+ * @tap_conn_count: Count of tap connections in connection table
+ * @splice_conn_count: Count of spliced connections in connection table
+ * @port_to_tap: Port bitmap: bound host/init-side, packets to guest/tap
+ * @port_to_init: Port bitmap: bound namespace-side, spliced to init
+ * @port_to_ns: Port bitmap: bound init-side, spliced to namespace
* @timer_run: Timestamp of most recent timer run
*/
struct tcp_ctx {
uint64_t hash_secret[2];
- int fd_min;
- int fd_max;
- int fd_listen_min;
- int fd_listen_max;
- int fd_conn_min;
- int fd_conn_max;
+ int tap_conn_count;
+ int splice_conn_count;
+ uint8_t port_to_tap [(USHRT_MAX + 1) / 8]; /* 1 bit/port, 0..65535 */
+ uint8_t port_to_init [(USHRT_MAX + 1) / 8]; /* USHRT_MAX / 8 is 1 short */
+ uint8_t port_to_ns [(USHRT_MAX + 1) / 8]; /* 65536 bits = 8192 bytes */
struct timespec timer_run;
};