aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2026-01-30 16:58:11 +1100
committerStefano Brivio <sbrivio@redhat.com>2026-01-31 04:25:08 +0100
commitaf7b81b5408da8c56bb22dd11679f2b4024a45c8 (patch)
treebb3d9145a1a1cb37e4f36cebe68386e282d97059
parent768baf42f1485a7f7c867c8eb3a7e5ddcddb9d86 (diff)
downloadpasst-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar.gz
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar.bz2
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar.lz
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar.xz
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.tar.zst
passt-af7b81b5408da8c56bb22dd11679f2b4024a45c8.zip
migrate: Use forward table information to close() listening socketsHEADmaster
On incoming migrations we need to bind() reconstructed sockets to their correct local address. We can't do this if the origin passt instance is in the same namespace and still has those addresses bound. Arguably that's a bug in bind()'s operation during repair mode, but for now we have to work around it. So, to allow local-to-local migrations we close() sockets on the outgoing side as we process them. In addition to closing the connected socket we also have to close the associated listen()ing socket, because that can also cause an address conflict. To do that, we introduced the listening_sock field in the connection state, because we had no other way to find the right listening sockets. Now that we have the forwarding table, we have a complete list of listening sockets elsewhere. We can use that instead, to close all listening sockets on outbound migration, rather than just the ones that might conflict. This is cleaner and, importantly, saves a valuable 32 bits in the flow state structure. It does mean that there is a longer window where a peer attempting to connect during migration might get a Connection Refused. I think this is an acceptable trade-off for now: arguably we should not allow local-to-local migrations in any case, since the socket closes make it impossible to safely roll back migration as per the qemu model. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> [sbrivio: Adjust comment to tcp_flow_migrate_source()] Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r--flow.c12
-rw-r--r--fwd.c21
-rw-r--r--fwd.h1
-rw-r--r--tcp.c11
-rw-r--r--tcp_conn.h3
5 files changed, 35 insertions, 13 deletions
diff --git a/flow.c b/flow.c
index fd4d5f3..5207143 100644
--- a/flow.c
+++ b/flow.c
@@ -1023,6 +1023,9 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
debug("...roll back migration");
+ if (fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP) < 0)
+ die("Failed to re-establish listening sockets");
+
foreach_established_tcp_flow(flow) {
if (FLOW_IDX(flow) >= bound)
break;
@@ -1147,6 +1150,15 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
return flow_migrate_source_rollback(c, FLOW_MAX, rc);
}
+ /* HACK: A local to local migrate will fail if the origin passt has the
+ * listening sockets still open when the destination passt tries to bind
+ * them. This does mean there's a window where we lose our listen()s,
+ * even if the migration is rolled back later. The only way to really
+ * fix that is to not allow local to local migration, which arguably we
+ * should (use namespaces for testing instead). */
+ debug("Stop listen()s");
+ fwd_listen_close(&c->tcp.fwd_in);
+
debug("Sending %u flows", ntohl(count));
if (!count)
diff --git a/fwd.c b/fwd.c
index edbeaf4..4052b79 100644
--- a/fwd.c
+++ b/fwd.c
@@ -654,6 +654,27 @@ int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd,
return 0;
}
+/** fwd_listen_close() - Close all listening sockets
+ * @fwd: Forwarding information; every open socket of every rule is closed
+ */
+void fwd_listen_close(const struct fwd_ports *fwd)
+{
+ unsigned i;
+
+ for (i = 0; i < fwd->count; i++) {
+ const struct fwd_rule *rule = &fwd->rules[i];
+ unsigned port;
+
+ for (port = rule->first; port <= rule->last; port++) { /* inclusive port range */
+ int *fdp = &rule->socks[port - rule->first]; /* socks[] is indexed relative to rule->first */
+ if (*fdp >= 0) { /* negative entries hold no socket: nothing to close */
+ close(*fdp);
+ *fdp = -1; /* -1 marks the slot as closed, so it is skipped next time */
+ }
+ }
+ }
+}
+
/* See enum in kernel's include/net/tcp_states.h */
#define UDP_LISTEN 0x07
#define TCP_LISTEN 0x0a
diff --git a/fwd.h b/fwd.h
index a5dc89d..1607011 100644
--- a/fwd.h
+++ b/fwd.h
@@ -118,6 +118,7 @@ void fwd_scan_ports_timer(struct ctx * c, const struct timespec *now);
int fwd_listen_sync(const struct ctx *c, const struct fwd_ports *fwd,
uint8_t pif, uint8_t proto);
+void fwd_listen_close(const struct fwd_ports *fwd);
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
union inany_addr *translated);
diff --git a/tcp.c b/tcp.c
index 9dd02cd..0a64892 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1704,7 +1704,6 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
conn->sock = s;
conn->timer = -1;
- conn->listening_sock = -1;
flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) {
flow_perror(flow, "Can't register with epoll");
@@ -2476,7 +2475,6 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
- struct tcp_tap_conn *conn;
union sockaddr_inany sa;
socklen_t sl = sizeof(sa);
struct flowside *ini;
@@ -2492,9 +2490,6 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
if (s < 0)
goto cancel;
- conn = (struct tcp_tap_conn *)flow;
- conn->listening_sock = ref.fd;
-
tcp_sock_set_nodelay(s);
/* FIXME: If useful: when the listening port has a specific bound
@@ -3393,7 +3388,7 @@ static int tcp_flow_repair_opt(const struct tcp_tap_conn *conn,
}
/**
- * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * tcp_flow_migrate_source() - Send data (flow table) for flow
* @fd: Descriptor for state migration
* @conn: Pointer to the TCP connection structure
*
@@ -3433,9 +3428,6 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
return rc;
}
- if (conn->listening_sock != -1 && !fcntl(conn->listening_sock, F_GETFD))
- close(conn->listening_sock);
-
return 0;
}
@@ -3645,7 +3637,6 @@ static int tcp_flow_repair_connect(const struct ctx *c,
}
conn->timer = -1;
- conn->listening_sock = -1;
return 0;
}
diff --git a/tcp_conn.h b/tcp_conn.h
index 9c6ff9e..21cea10 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -18,7 +18,6 @@
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
- * @listening_sock: Listening socket this socket was accept()ed from, or -1
* @timer: timerfd descriptor for timeout events
* @flags: Connection flags representing internal attributes
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@@ -75,8 +74,6 @@ struct tcp_tap_conn {
#define CONN_STATE_BITS /* Setting these clears other flags */ \
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
- int listening_sock;
-
int timer :FD_REF_BITS;
uint8_t flags;