aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorLaurent Vivier <lvivier@redhat.com>2025-10-21 23:01:13 +0200
committerStefano Brivio <sbrivio@redhat.com>2025-10-30 15:32:50 +0100
commitdd5302dd7bf518aa2c50a9819ee06ea2d6fd0061 (patch)
tree3bad20da87579b02e9f848ebbbfbc89b51ea4370
parent05972c7c4daf0b2479a415bf7240944b999d9081 (diff)
downloadpasst-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar.gz
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar.bz2
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar.lz
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar.xz
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.tar.zst
passt-dd5302dd7bf518aa2c50a9819ee06ea2d6fd0061.zip
tcp, flow: Replace per-connection in_epoll flag with an epollid in flow_common
The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked whether a connection was registered with epoll, not which epoll instance. This limited flexibility for future multi-epoll support. Replace the boolean with an epollid field in flow_common that identifies which epoll instance the flow is registered with. Use FLOW_EPOLLID_INVALID to indicate when a flow is not registered with any epoll instance. An epoll_id_to_fd[] mapping table translates epoll ids to their corresponding epoll file descriptors. Add helper functions: - flow_in_epoll() to check if a flow is registered with epoll - flow_epollfd() to retrieve the epoll fd for a flow's thread - flow_epollid_register() to register an epoll fd with an epollid - flow_epollid_set() to set the epollid of a flow - flow_epollid_clear() to reset the epoll id of a flow This change also simplifies tcp_timer_ctl() and conn_flag_do() by removing the need to pass the context 'c', since the epoll fd is now directly accessible from the flow structure via flow_epollfd(). Add a defensive check at the beginning of tcp_flow_repair_queue() to avoid a false positive with "make clang-tidy": error: The 1st argument to 'send' is < 0 but should be >= 0 3230 | ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0); Signed-off-by: Laurent Vivier <lvivier@redhat.com> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r--flow.c61
-rw-r--r--flow.h17
-rw-r--r--passt.c1
-rw-r--r--tcp.c44
-rw-r--r--tcp_conn.h8
-rw-r--r--tcp_splice.c24
6 files changed, 116 insertions, 39 deletions
diff --git a/flow.c b/flow.c
index dd5f81b..278a9cf 100644
--- a/flow.c
+++ b/flow.c
@@ -116,6 +116,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
static const union flow *flow_new_entry; /* = NULL */
+static int epoll_id_to_fd[EPOLLFD_ID_MAX];
/* Hash table to index it */
#define FLOW_HASH_LOAD 70 /* % */
@@ -348,6 +349,63 @@ static void flow_set_state(struct flow_common *f, enum flow_state state)
}
/**
+ * flow_in_epoll() - Check if flow is registered with an epoll instance
+ * @f: Flow to check
+ *
+ * Return: true if flow is registered with epoll, false otherwise
+ */
+bool flow_in_epoll(const struct flow_common *f)
+{
+ return f->epollid != EPOLLFD_ID_INVALID;
+}
+
+/**
+ * flow_epollfd() - Get the epoll file descriptor for a flow
+ * @f: Flow to query
+ *
+ * Return: epoll file descriptor associated with the flow's thread
+ */
+int flow_epollfd(const struct flow_common *f)
+{
+ ASSERT(f->epollid < EPOLLFD_ID_MAX);
+
+ return epoll_id_to_fd[f->epollid];
+}
+
+/**
+ * flow_epollid_set() - Associate a flow with an epoll id
+ * @f: Flow to update
+ * @epollid: epoll id to associate with this flow
+ */
+void flow_epollid_set(struct flow_common *f, int epollid)
+{
+ ASSERT(epollid < EPOLLFD_ID_MAX);
+
+ f->epollid = epollid;
+}
+
+/**
+ * flow_epollid_clear() - Clear the flow epoll id
+ * @f: Flow to update
+ */
+void flow_epollid_clear(struct flow_common *f)
+{
+ f->epollid = EPOLLFD_ID_INVALID;
+}
+
+/**
+ * flow_epollid_register() - Initialize the epoll id -> fd mapping
+ * @epollid: epoll id to associate to
+ * @epollfd: epoll file descriptor for this epoll id
+ */
+void flow_epollid_register(int epollid, int epollfd)
+{
+ ASSERT(epollid < EPOLLFD_ID_MAX);
+
+ epoll_id_to_fd[epollid] = epollfd;
+}
+
+/**
* flow_initiate_() - Move flow to INI, setting pif[INISIDE]
* @flow: Flow to change state
* @pif: pif of the initiating side
@@ -550,6 +608,7 @@ union flow *flow_alloc(void)
flow_new_entry = flow;
memset(flow, 0, sizeof(*flow));
+ flow_epollid_clear(&flow->f);
flow_set_state(&flow->f, FLOW_STATE_NEW);
return flow;
@@ -829,7 +888,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
case FLOW_TCP_SPLICE:
closed = tcp_splice_flow_defer(&flow->tcp_splice);
if (!closed && timer)
- tcp_splice_timer(c, &flow->tcp_splice);
+ tcp_splice_timer(&flow->tcp_splice);
break;
case FLOW_PING4:
case FLOW_PING6:
diff --git a/flow.h b/flow.h
index c934f2e..b43b0b1 100644
--- a/flow.h
+++ b/flow.h
@@ -177,7 +177,8 @@ int flowside_connect(const struct ctx *c, int s,
* @type: Type of packet flow
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
- * @tap_omac: MAC address of remote endpoint as seen from the guest
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
+ * @epollid: epollfd identifier, or EPOLLFD_ID_INVALID
*/
struct flow_common {
#ifdef __GNUC__
@@ -193,9 +194,18 @@ struct flow_common {
#endif
uint8_t pif[SIDES];
struct flowside side[SIDES];
+
uint8_t tap_omac[6];
+
+#define EPOLLFD_ID_BITS 8
+ unsigned int epollid:EPOLLFD_ID_BITS;
};
+#define EPOLLFD_ID_DEFAULT 0
+#define EPOLLFD_ID_SIZE (1 << EPOLLFD_ID_BITS)
+#define EPOLLFD_ID_MAX (EPOLLFD_ID_SIZE - 1)
+#define EPOLLFD_ID_INVALID EPOLLFD_ID_MAX
+
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
#define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS)
@@ -251,6 +261,11 @@ flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
union flow;
void flow_init(void);
+bool flow_in_epoll(const struct flow_common *f);
+int flow_epollfd(const struct flow_common *f);
+void flow_epollid_set(struct flow_common *f, int epollid);
+void flow_epollid_clear(struct flow_common *f);
+void flow_epollid_register(int epollid, int epollfd);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
int fd);
diff --git a/passt.c b/passt.c
index 2e96d04..a8e06e1 100644
--- a/passt.c
+++ b/passt.c
@@ -286,6 +286,7 @@ int main(int argc, char **argv)
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
if (c.epollfd == -1)
die_perror("Failed to create epoll file descriptor");
+ flow_epollid_register(EPOLLFD_ID_DEFAULT, c.epollfd);
if (getrlimit(RLIMIT_NOFILE, &limit))
die_perror("Failed to get maximum value of open files limit");
diff --git a/tcp.c b/tcp.c
index 48dec5a..8cf4459 100644
--- a/tcp.c
+++ b/tcp.c
@@ -504,25 +504,27 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
*/
static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock,
.flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), };
struct epoll_event ev = { .data.u64 = ref.u64 };
+ int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
+ : c->epollfd;
if (conn->events == CLOSED) {
- if (conn->in_epoll)
- epoll_del(c->epollfd, conn->sock);
+ if (flow_in_epoll(&conn->f))
+ epoll_del(epollfd, conn->sock);
if (conn->timer != -1)
- epoll_del(c->epollfd, conn->timer);
+ epoll_del(epollfd, conn->timer);
return 0;
}
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev))
+ if (epoll_ctl(epollfd, m, conn->sock, &ev))
return -errno;
- conn->in_epoll = true;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
if (conn->timer != -1) {
union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER,
@@ -531,7 +533,8 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
struct epoll_event ev_t = { .data.u64 = ref_t.u64,
.events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t))
+ if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD,
+ conn->timer, &ev_t))
return -errno;
}
@@ -540,12 +543,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
/**
* tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed
- * @c: Execution context
* @conn: Connection pointer
*
* #syscalls timerfd_create timerfd_settime
*/
-static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_timer_ctl(struct tcp_tap_conn *conn)
{
struct itimerspec it = { { 0 }, { 0 } };
@@ -558,6 +560,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
.flow = FLOW_IDX(conn) };
struct epoll_event ev = { .data.u64 = ref.u64,
.events = EPOLLIN | EPOLLET };
+ int epollfd = flow_epollfd(&conn->f);
int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0);
@@ -570,7 +573,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
}
conn->timer = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
+ if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) {
flow_dbg_perror(conn, "failed to add timer");
close(conn->timer);
conn->timer = -1;
@@ -628,7 +631,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
* flags and factor this into the logic below.
*/
if (flag == ACK_FROM_TAP_DUE)
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
return;
}
@@ -644,7 +647,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE ||
(flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) ||
(flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE)))
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
/**
@@ -699,7 +702,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_epoll_ctl(c, conn);
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED))
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
/**
@@ -1767,7 +1770,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
seq, conn->seq_from_tap);
tcp_send_flag(c, conn, ACK);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
if (p->count == 1) {
tcp_tap_window_update(c, conn,
@@ -2418,7 +2421,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
if (conn->flags & ACK_TO_TAP_DUE) {
tcp_send_flag(c, conn, ACK_IF_NEEDED);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
} else if (conn->flags & ACK_FROM_TAP_DUE) {
if (!(conn->events & ESTABLISHED)) {
flow_dbg(conn, "handshake timeout");
@@ -2440,7 +2443,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
return;
tcp_data_from_sock(c, conn);
- tcp_timer_ctl(c, conn);
+ tcp_timer_ctl(conn);
}
} else {
struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } };
@@ -3235,6 +3238,11 @@ static int tcp_flow_repair_queue(const struct tcp_tap_conn *conn,
size_t chunk = len;
uint8_t *p = buf;
+ if (conn->sock < 0) {
+ flow_err(conn, "Invalid socket descriptor for repair queue");
+ return -EBADF;
+ }
+
while (len > 0) {
ssize_t rc = send(conn->sock, p, MIN(len, chunk), 0);
@@ -3488,7 +3496,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c,
if (c->migrate_no_linger)
close(s);
else
- epoll_del(c->epollfd, s);
+ epoll_del(flow_epollfd(&conn->f), s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
* based on the end of the queues.
@@ -3637,7 +3645,7 @@ static int tcp_flow_repair_connect(const struct ctx *c,
return rc;
}
- conn->in_epoll = 0;
+ flow_epollid_clear(&conn->f);
conn->timer = -1;
conn->listening_sock = -1;
diff --git a/tcp_conn.h b/tcp_conn.h
index 38b5c54..8133312 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -12,7 +12,6 @@
/**
* struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
* @f: Generic flow information
- * @in_epoll: Is the connection in the epoll set?
* @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
* @ws_from_tap: Window scaling factor advertised from tap/guest
* @ws_to_tap: Window scaling factor advertised to tap/guest
@@ -36,8 +35,6 @@ struct tcp_tap_conn {
/* Must be first element */
struct flow_common f;
- bool in_epoll :1;
-
#define TCP_RETRANS_BITS 3
unsigned int retrans :TCP_RETRANS_BITS;
#define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS)
@@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext {
* @written: Bytes written (not fully written from one other side read)
* @events: Events observed/actions performed on connection
* @flags: Connection flags (attributes, not events)
- * @in_epoll: Is the connection in the epoll set?
*/
struct tcp_splice_conn {
/* Must be first element */
@@ -220,8 +216,6 @@ struct tcp_splice_conn {
#define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0))
#define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2))
#define CLOSING BIT(4)
-
- bool in_epoll :1;
};
/* Socket pools */
@@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
+void tcp_splice_timer(struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
int tcp_conn_sock(sa_family_t af);
int tcp_sock_refill_pool(int pool[], sa_family_t af);
diff --git a/tcp_splice.c b/tcp_splice.c
index 6f21184..5efd859 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -149,7 +149,9 @@ static void tcp_splice_conn_epoll_events(uint16_t events,
static int tcp_splice_epoll_ctl(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
+ int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f)
+ : c->epollfd;
+ int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
const union epoll_ref ref[SIDES] = {
{ .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
.flowside = FLOW_SIDX(conn, 0) },
@@ -161,25 +163,24 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
tcp_splice_conn_epoll_events(conn->events, ev);
- if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
- epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
+
+ if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) ||
+ epoll_ctl(epollfd, m, conn->s[1], &ev[1])) {
int ret = -errno;
flow_perror(conn, "ERROR on epoll_ctl()");
return ret;
}
-
- conn->in_epoll = true;
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
return 0;
}
/**
* conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
- * @c: Execution context
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
+static void conn_flag_do(struct tcp_splice_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@@ -204,15 +205,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_del(c->epollfd, conn->s[0]);
- epoll_del(c->epollfd, conn->s[1]);
+ epoll_del(flow_epollfd(&conn->f), conn->s[0]);
+ epoll_del(flow_epollfd(&conn->f), conn->s[1]);
}
}
#define conn_flag(c, conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
+ conn_flag_do(conn, flag); \
} while (0)
/**
@@ -751,10 +752,9 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
- * @c: Execution context
* @conn: Connection to handle
*/
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
+void tcp_splice_timer(struct tcp_splice_conn *conn)
{
unsigned sidei;