aboutgitcodebugslistschat
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2024-07-18 15:26:35 +1000
committerStefano Brivio <sbrivio@redhat.com>2024-07-19 18:32:59 +0200
commitacca4235c46f0b2ecceb991d75a09616309afb3c (patch)
tree92fb7375f3481b958f96c8124db6f5b7dd469874
parent163a339214dd34696ce485930f35ed21c86057f0 (diff)
downloadpasst-acca4235c46f0b2ecceb991d75a09616309afb3c.tar
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.tar.gz
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.tar.bz2
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.tar.lz
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.tar.xz
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.tar.zst
passt-acca4235c46f0b2ecceb991d75a09616309afb3c.zip
flow, tcp: Generalise TCP hash table to general flow hash table
Move the data structures and helper functions for the TCP hash table to flow.c, making it a general hash table indexing sides of flows. This is largely code motion and straightforward renames. There are two semantic changes: * flow_lookup_af() now needs to verify that the entry has a matching protocol and interface as well as matching addresses and ports. * We double the size of the hash table, because it's now at least theoretically possible for both sides of each flow to be hashed. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
-rw-r--r--flow.c155
-rw-r--r--flow.h11
-rw-r--r--flow_table.h3
-rw-r--r--tcp.c147
-rw-r--r--tcp_internal.h1
5 files changed, 172 insertions, 145 deletions
diff --git a/flow.c b/flow.c
index e70e3cb..b291f25 100644
--- a/flow.c
+++ b/flow.c
@@ -105,6 +105,16 @@ unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
static const union flow *flow_new_entry; /* = NULL */
+/* Hash table to index it */
+#define FLOW_HASH_LOAD 70 /* % */
+#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
+
+/* Table for lookup from flowside information */
+static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
+
+static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
+"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
+
/* Last time the flow timers ran */
static struct timespec flow_timer_run;
@@ -116,9 +126,9 @@ static struct timespec flow_timer_run;
* @faddr: Forwarding address (pointer to in_addr or in6_addr)
* @fport: Forwarding port
*/
-void flowside_from_af(struct flowside *side, sa_family_t af,
- const void *eaddr, in_port_t eport,
- const void *faddr, in_port_t fport)
+static void flowside_from_af(struct flowside *side, sa_family_t af,
+ const void *eaddr, in_port_t eport,
+ const void *faddr, in_port_t fport)
{
if (faddr)
inany_from_af(&side->faddr, af, faddr);
@@ -410,8 +420,8 @@ void flow_alloc_cancel(union flow *flow)
*
* Return: hash value
*/
-uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
- const struct flowside *side)
+static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
+ const struct flowside *side)
{
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
@@ -431,6 +441,136 @@ uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
}
/**
+ * flow_sidx_hash() - Calculate hash value for given side of a given flow
+ * @c: Execution context
+ * @sidx: Flow & side index to get hash for
+ *
+ * Return: hash value, of the flow & side represented by @sidx
+ */
+static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
+{
+ const struct flow_common *f = &flow_at_sidx(sidx)->f;
+ return flow_hash(c, FLOW_PROTO(f),
+ f->pif[sidx.sidei], &f->side[sidx.sidei]);
+}
+
+/**
+ * flow_hash_probe() - Find hash bucket for a flow
+ * @c: Execution context
+ * @sidx: Flow and side to find bucket for
+ *
+ * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * suitable free bucket for it.
+ */
+static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
+{
+ unsigned b = flow_sidx_hash(c, sidx) % FLOW_HASH_SIZE;
+
+ /* Linear probing */
+ while (flow_sidx_valid(flow_hashtab[b]) &&
+ !flow_sidx_eq(flow_hashtab[b], sidx))
+ b = mod_sub(b, 1, FLOW_HASH_SIZE);
+
+ return b;
+}
+
+/**
+ * flow_hash_insert() - Insert side of a flow into into hash table
+ * @c: Execution context
+ * @sidx: Flow & side index
+ */
+void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
+{
+ unsigned b = flow_hash_probe(c, sidx);
+
+ flow_hashtab[b] = sidx;
+ flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u",
+ sidx.sidei, b);
+}
+
+/**
+ * flow_hash_remove() - Drop side of a flow from the hash table
+ * @c: Execution context
+ * @sidx: Side of flow to remove
+ */
+void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
+{
+ unsigned b = flow_hash_probe(c, sidx), s;
+
+ if (!flow_sidx_valid(flow_hashtab[b]))
+ return; /* Redundant remove */
+
+ flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u",
+ sidx.sidei, b);
+
+ /* Scan the remainder of the cluster */
+ for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
+ flow_sidx_valid(flow_hashtab[s]);
+ s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
+ unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;
+
+ if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
+ /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
+ debug("hash table remove: shuffle %u -> %u", s, b);
+ flow_hashtab[b] = flow_hashtab[s];
+ b = s;
+ }
+ }
+
+ flow_hashtab[b] = FLOW_SIDX_NONE;
+}
+
+/**
+ * flowside_lookup() - Look for a matching flowside in the flow table
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: pif to look for in the table
+ * @side: Flowside to look for in the table
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
+ uint8_t pif, const struct flowside *side)
+{
+ flow_sidx_t sidx;
+ union flow *flow;
+ unsigned b;
+
+ b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE;
+ while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) &&
+ !(FLOW_PROTO(&flow->f) == proto &&
+ flow->f.pif[sidx.sidei] == pif &&
+ flowside_eq(&flow->f.side[sidx.sidei], side)))
+ b = (b + 1) % FLOW_HASH_SIZE;
+
+ return flow_hashtab[b];
+}
+
+/**
+ * flow_lookup_af() - Look up a flow given addressing information
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: Interface of the flow
+ * @af: Address family, AF_INET or AF_INET6
+ * @eaddr: Guest side endpoint address (guest local address)
+ * @faddr: Guest side forwarding address (guest remote address)
+ * @eport: Guest side endpoint port (guest local port)
+ * @fport: Guest side forwarding port (guest remote port)
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+flow_sidx_t flow_lookup_af(const struct ctx *c,
+ uint8_t proto, uint8_t pif, sa_family_t af,
+ const void *eaddr, const void *faddr,
+ in_port_t eport, in_port_t fport)
+{
+ struct flowside side;
+
+ flowside_from_af(&side, af, eaddr, eport, faddr, fport);
+ return flowside_lookup(c, proto, pif, &side);
+}
+
+/**
* flow_defer_handler() - Handler for per-flow deferred and timed tasks
* @c: Execution context
* @now: Current timestamp
@@ -543,7 +683,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*/
void flow_init(void)
{
+ unsigned b;
+
/* Initial state is a single free cluster containing the whole table */
flowtab[0].free.n = FLOW_MAX;
flowtab[0].free.next = FLOW_MAX;
+
+ for (b = 0; b < FLOW_HASH_SIZE; b++)
+ flow_hashtab[b] = FLOW_SIDX_NONE;
}
diff --git a/flow.h b/flow.h
index a0550dc..fcb4121 100644
--- a/flow.h
+++ b/flow.h
@@ -164,10 +164,6 @@ static inline bool flowside_eq(const struct flowside *left,
left->fport == right->fport;
}
-void flowside_from_af(struct flowside *side, sa_family_t af,
- const void *eaddr, in_port_t eport,
- const void *faddr, in_port_t fport);
-
/**
* struct flow_common - Common fields for packet flows
* @state: State of the flow table entry
@@ -233,6 +229,13 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b)
return (a.flowi == b.flowi) && (a.sidei == b.sidei);
}
+void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx);
+void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx);
+flow_sidx_t flow_lookup_af(const struct ctx *c,
+ uint8_t proto, uint8_t pif, sa_family_t af,
+ const void *eaddr, const void *faddr,
+ in_port_t eport, in_port_t fport);
+
union flow;
void flow_init(void);
diff --git a/flow_table.h b/flow_table.h
index b3cb546..aabdbb7 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -146,7 +146,4 @@ void flow_activate(struct flow_common *f);
#define FLOW_ACTIVATE(flow_) \
(flow_activate(&(flow_)->f))
-uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
- const struct flowside *side);
-
#endif /* FLOW_TABLE_H */
diff --git a/tcp.c b/tcp.c
index 5c8c8d1..0964855 100644
--- a/tcp.c
+++ b/tcp.c
@@ -305,9 +305,6 @@
#include "tcp_internal.h"
#include "tcp_buf.h"
-#define TCP_HASH_TABLE_LOAD 70 /* % */
-#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
-
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
@@ -377,12 +374,6 @@ bool peek_offset_cap;
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
-/* Table for lookup from flowside information */
-static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE];
-
-static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
- "Safe linear probing requires hash table larger than connection table");
-
/* Pools for pre-opened sockets (in init) */
int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
@@ -605,9 +596,6 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
}
-static void tcp_hash_remove(const struct ctx *c,
- const struct tcp_tap_conn *conn);
-
/**
* conn_event_do() - Set and log connection events, update epoll state
* @c: Execution context
@@ -653,7 +641,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
num == -1 ? "CLOSED" : tcp_event_str[num]);
if (event == CLOSED)
- tcp_hash_remove(c, conn);
+ flow_hash_remove(c, TAP_SIDX(conn));
else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD))
conn_flag(c, conn, ACTIVE_CLOSE);
else
@@ -853,117 +841,6 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
}
/**
- * tcp_conn_hash() - Calculate hash bucket of an existing connection
- * @c: Execution context
- * @conn: Connection
- *
- * Return: hash value, needs to be adjusted for table size
- */
-static uint64_t tcp_conn_hash(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- const struct flowside *tapside = TAPFLOW(conn);
-
- return flow_hash(c, IPPROTO_TCP, conn->f.pif[TAPSIDE(conn)], tapside);
-}
-
-/**
- * tcp_hash_probe() - Find hash bucket for a connection
- * @c: Execution context
- * @conn: Connection to find bucket for
- *
- * Return: If @conn is in the table, its current bucket, otherwise a suitable
- * free bucket for it.
- */
-static inline unsigned tcp_hash_probe(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE;
- flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE(conn));
-
- /* Linear probing */
- while (flow_sidx_valid(tc_hash[b]) && !flow_sidx_eq(tc_hash[b], sidx))
- b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
-
- return b;
-}
-
-/**
- * tcp_hash_insert() - Insert connection into hash table, chain link
- * @c: Execution context
- * @conn: Connection pointer
- */
-static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
-{
- unsigned b = tcp_hash_probe(c, conn);
-
- tc_hash[b] = FLOW_SIDX(conn, TAPSIDE(conn));
- flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b);
-}
-
-/**
- * tcp_hash_remove() - Drop connection from hash table, chain unlink
- * @c: Execution context
- * @conn: Connection pointer
- */
-static void tcp_hash_remove(const struct ctx *c,
- const struct tcp_tap_conn *conn)
-{
- unsigned b = tcp_hash_probe(c, conn), s;
- union flow *flow;
-
- if (!flow_sidx_valid(tc_hash[b]))
- return; /* Redundant remove */
-
- flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b);
-
- /* Scan the remainder of the cluster */
- for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
- (flow = flow_at_sidx(tc_hash[s]));
- s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) {
- unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE;
-
- if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) {
- /* tc_hash[s] can live in tc_hash[b]'s slot */
- debug("hash table remove: shuffle %u -> %u", s, b);
- tc_hash[b] = tc_hash[s];
- b = s;
- }
- }
-
- tc_hash[b] = FLOW_SIDX_NONE;
-}
-
-/**
- * tcp_hash_lookup() - Look up connection given remote address and ports
- * @c: Execution context
- * @af: Address family, AF_INET or AF_INET6
- * @eaddr: Guest side endpoint address (guest local address)
- * @faddr: Guest side forwarding address (guest remote address)
- * @eport: Guest side endpoint port (guest local port)
- * @fport: Guest side forwarding port (guest remote port)
- *
- * Return: connection pointer, if found, -ENOENT otherwise
- */
-static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, sa_family_t af,
- const void *eaddr, const void *faddr,
- in_port_t eport, in_port_t fport)
-{
- struct flowside side;
- union flow *flow;
- unsigned b;
-
- flowside_from_af(&side, af, eaddr, eport, faddr, fport);
-
- b = flow_hash(c, IPPROTO_TCP, PIF_TAP, &side) % TCP_HASH_TABLE_SIZE;
- while ((flow = flow_at_sidx(tc_hash[b])) &&
- !flowside_eq(&flow->f.side[TAPSIDE(flow)], &side))
- b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE);
-
- return &flow->tcp;
-}
-
-/**
* tcp_flow_defer() - Deferred per-flow handling (clean up closed connections)
* @conn: Connection to handle
*
@@ -1710,7 +1587,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
tcp_seq_init(c, conn, now);
conn->seq_ack_from_tap = conn->seq_to_tap;
- tcp_hash_insert(c, conn);
+ flow_hash_insert(c, TAP_SIDX(conn));
tcp_bind_outbound(c, conn, s);
@@ -2047,6 +1924,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const struct tcphdr *th;
size_t optlen, len;
const char *opts;
+ union flow *flow;
+ flow_sidx_t sidx;
int ack_due = 0;
int count;
@@ -2062,17 +1941,22 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL);
opts = packet_get(p, idx, sizeof(*th), optlen, NULL);
- conn = tcp_hash_lookup(c, af, saddr, daddr,
- ntohs(th->source), ntohs(th->dest));
+ sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr,
+ ntohs(th->source), ntohs(th->dest));
+ flow = flow_at_sidx(sidx);
/* New connection from tap */
- if (!conn) {
+ if (!flow) {
if (opts && th->syn && !th->ack)
tcp_conn_from_tap(c, af, saddr, daddr, th,
opts, optlen, now);
return 1;
}
+ ASSERT(flow->f.type == FLOW_TCP);
+ ASSERT(pif_at_sidx(sidx) == PIF_TAP);
+ conn = &flow->tcp;
+
flow_trace(conn, "packet length %zu from tap", len);
if (th->rst) {
@@ -2250,7 +2134,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
conn_event(c, conn, SOCK_ACCEPTED);
tcp_seq_init(c, conn, now);
- tcp_hash_insert(c, conn);
+ flow_hash_insert(c, TAP_SIDX(conn));
conn->seq_ack_from_tap = conn->seq_to_tap;
@@ -2652,14 +2536,11 @@ static void tcp_sock_refill_init(const struct ctx *c)
*/
int tcp_init(struct ctx *c)
{
- unsigned int b, optv = 0;
+ unsigned int optv = 0;
int s;
ASSERT(!c->no_tcp);
- for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
- tc_hash[b] = FLOW_SIDX_NONE;
-
if (c->ifi4)
tcp_sock4_iov_init(c);
diff --git a/tcp_internal.h b/tcp_internal.h
index ac6d4b2..8b60aab 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -42,6 +42,7 @@
#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
+#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->faddr))
#define CONN_V6(conn) (!CONN_V4(conn))