aboutgitcodebugslistschat
path: root/flow.c
diff options
context:
space:
mode:
Diffstat (limited to 'flow.c')
-rw-r--r--flow.c693
1 files changed, 627 insertions, 66 deletions
diff --git a/flow.c b/flow.c
index 5bb24cc..1ea112b 100644
--- a/flow.c
+++ b/flow.c
@@ -5,9 +5,11 @@
* Tracking for logical "flows" of packets.
*/
+#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
+#include <sched.h>
#include <string.h>
#include "util.h"
@@ -18,10 +20,24 @@
#include "flow.h"
#include "flow_table.h"
+const char *flow_state_str[] = {
+ [FLOW_STATE_FREE] = "FREE",
+ [FLOW_STATE_NEW] = "NEW",
+ [FLOW_STATE_INI] = "INI",
+ [FLOW_STATE_TGT] = "TGT",
+ [FLOW_STATE_TYPED] = "TYPED",
+ [FLOW_STATE_ACTIVE] = "ACTIVE",
+};
+static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
+ "flow_state_str[] doesn't match enum flow_state");
+
const char *flow_type_str[] = {
[FLOW_TYPE_NONE] = "<none>",
[FLOW_TCP] = "TCP connection",
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
+ [FLOW_PING4] = "ICMP ping sequence",
+ [FLOW_PING6] = "ICMPv6 ping sequence",
+ [FLOW_UDP] = "UDP flow",
};
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
"flow_type_str[] doesn't match enum flow_type");
@@ -29,6 +45,9 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
const uint8_t flow_proto[] = {
[FLOW_TCP] = IPPROTO_TCP,
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
+ [FLOW_PING4] = IPPROTO_ICMP,
+ [FLOW_PING6] = IPPROTO_ICMPV6,
+ [FLOW_UDP] = IPPROTO_UDP,
};
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
"flow_proto[] doesn't match enum flow_type");
@@ -36,46 +55,6 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
/* Global Flow Table */
/**
- * DOC: Theory of Operation - flow entry life cycle
- *
- * An individual flow table entry moves through these logical states, usually in
- * this order.
- *
- * FREE - Part of the general pool of free flow table entries
- * Operations:
- * - flow_alloc() finds an entry and moves it to ALLOC state
- *
- * ALLOC - A tentatively allocated entry
- * Operations:
- * - flow_alloc_cancel() returns the entry to FREE state
- * - FLOW_START() set the entry's type and moves to START state
- * Caveats:
- * - It's not safe to write fields in the flow entry
- * - It's not safe to allocate further entries with flow_alloc()
- * - It's not safe to return to the main epoll loop (use FLOW_START()
- * to move to START state before doing so)
- * - It's not safe to use flow_*() logging functions
- *
- * START - An entry being prepared by flow type specific code
- * Operations:
- * - Flow type specific fields may be accessed
- * - flow_*() logging functions
- * - flow_alloc_cancel() returns the entry to FREE state
- * Caveats:
- * - Returning to the main epoll loop or allocating another entry
- * with flow_alloc() implicitly moves the entry to ACTIVE state.
- *
- * ACTIVE - An active flow entry managed by flow type specific code
- * Operations:
- * - Flow type specific fields may be accessed
- * - flow_*() logging functions
- * - Flow may be expired by returning 'true' from flow type specific
- * deferred or timer handler. This will return it to FREE state.
- * Caveats:
- * - It's not safe to call flow_alloc_cancel()
- */
-
-/**
* DOC: Theory of Operation - allocating and freeing flow entries
*
* Flows are entries in flowtab[]. We need to routinely scan the whole table to
@@ -128,10 +107,156 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
+static const union flow *flow_new_entry; /* = NULL */
+
+/* Hash table to index it */
+#define FLOW_HASH_LOAD 70 /* % */
+#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
+
+/* Table for lookup from flowside information */
+static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
+
+static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
+"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
/* Last time the flow timers ran */
static struct timespec flow_timer_run;
+/** flowside_from_af() - Initialise flowside from addresses
+ * @side: flowside to initialise
+ * @af: Address family (AF_INET or AF_INET6)
+ * @eaddr: Endpoint address (pointer to in_addr or in6_addr)
+ * @eport: Endpoint port
+ * @oaddr: Our address (pointer to in_addr or in6_addr)
+ * @oport: Our port
+ */
+static void flowside_from_af(struct flowside *side, sa_family_t af,
+ const void *eaddr, in_port_t eport,
+ const void *oaddr, in_port_t oport)
+{
+ if (oaddr)
+ inany_from_af(&side->oaddr, af, oaddr);
+ else
+ side->oaddr = inany_any6;
+ side->oport = oport;
+
+ if (eaddr)
+ inany_from_af(&side->eaddr, af, eaddr);
+ else
+ side->eaddr = inany_any6;
+ side->eport = eport;
+}
+
+/**
+ * struct flowside_sock_args - Parameters for flowside_sock_splice()
+ * @c: Execution context
+ * @fd: Filled in with new socket fd
+ * @err: Filled in with errno if something failed
+ * @type: Socket epoll type
+ * @sa: Socket address
+ * @sl: Length of @sa
+ * @data: epoll reference data
+ */
+struct flowside_sock_args {
+ const struct ctx *c;
+ int fd;
+ int err;
+ enum epoll_type type;
+ const struct sockaddr *sa;
+ socklen_t sl;
+ const char *path;
+ uint32_t data;
+};
+
+/** flowside_sock_splice() - Create and bind socket for PIF_SPLICE based on flowside
+ * @arg: Argument as a struct flowside_sock_args
+ *
+ * Return: 0
+ */
+static int flowside_sock_splice(void *arg)
+{
+ struct flowside_sock_args *a = arg;
+
+ ns_enter(a->c);
+
+ a->fd = sock_l4_sa(a->c, a->type, a->sa, a->sl, NULL,
+ a->sa->sa_family == AF_INET6, a->data);
+ a->err = errno;
+
+ return 0;
+}
+
+/** flowside_sock_l4() - Create and bind socket based on flowside
+ * @c: Execution context
+ * @type: Socket epoll type
+ * @pif: Interface for this socket
+ * @tgt: Target flowside
+ * @data: epoll reference portion for protocol handlers
+ *
+ * Return: socket fd of protocol @proto bound to our address and port from @tgt
+ * (if specified).
+ */
+int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif,
+ const struct flowside *tgt, uint32_t data)
+{
+ const char *ifname = NULL;
+ union sockaddr_inany sa;
+ socklen_t sl;
+
+ ASSERT(pif_is_socket(pif));
+
+ pif_sockaddr(c, &sa, &sl, pif, &tgt->oaddr, tgt->oport);
+
+ switch (pif) {
+ case PIF_HOST:
+ if (inany_is_loopback(&tgt->oaddr))
+ ifname = NULL;
+ else if (sa.sa_family == AF_INET)
+ ifname = c->ip4.ifname_out;
+ else if (sa.sa_family == AF_INET6)
+ ifname = c->ip6.ifname_out;
+
+ return sock_l4_sa(c, type, &sa, sl, ifname,
+ sa.sa_family == AF_INET6, data);
+
+ case PIF_SPLICE: {
+ struct flowside_sock_args args = {
+ .c = c, .type = type,
+ .sa = &sa.sa, .sl = sl, .data = data,
+ };
+ NS_CALL(flowside_sock_splice, &args);
+ errno = args.err;
+ return args.fd;
+ }
+
+ default:
+ /* If we add new socket pifs, they'll need to be implemented
+ * here
+ */
+ ASSERT(0);
+ }
+}
+
+/** flowside_connect() - Connect a socket based on flowside
+ * @c: Execution context
+ * @s: Socket to connect
+ * @pif: Target pif
+ * @tgt: Target flowside
+ *
+ * Connect @s to the endpoint address and port from @tgt.
+ *
+ * Return: 0 on success, negative on error
+ */
+int flowside_connect(const struct ctx *c, int s,
+ uint8_t pif, const struct flowside *tgt)
+{
+ union sockaddr_inany sa;
+ socklen_t sl;
+
+ pif_sockaddr(c, &sa, &sl, pif, &tgt->eaddr, tgt->eport);
+ return connect(s, &sa.sa, sl);
+}
+
/** flow_log_ - Log flow-related message
* @f: flow the message is related to
* @pri: Log priority
@@ -140,6 +265,7 @@ static struct timespec flow_timer_run;
*/
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
+ const char *type_or_state;
char msg[BUFSIZ];
va_list args;
@@ -147,40 +273,221 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
(void)vsnprintf(msg, sizeof(msg), fmt, args);
va_end(args);
- logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
+ /* Show type if it's set, otherwise the state */
+ if (f->state < FLOW_STATE_TYPED)
+ type_or_state = FLOW_STATE(f);
+ else
+ type_or_state = FLOW_TYPE(f);
+
+ logmsg(true, false, pri,
+ "Flow %u (%s): %s", flow_idx(f), type_or_state, msg);
+}
+
+/** flow_log_details_() - Log the details of a flow
+ * @f: flow to log
+ * @pri: Log priority
+ * @state: State to log details according to
+ *
+ * Logs the details of the flow: endpoints, interfaces, type etc.
+ */
+void flow_log_details_(const struct flow_common *f, int pri,
+ enum flow_state state)
+{
+ char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN];
+ char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN];
+ const struct flowside *ini = &f->side[INISIDE];
+ const struct flowside *tgt = &f->side[TGTSIDE];
+
+ if (state >= FLOW_STATE_TGT)
+ flow_log_(f, pri,
+ "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu",
+ pif_name(f->pif[INISIDE]),
+ inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
+ ini->eport,
+ inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
+ ini->oport,
+ pif_name(f->pif[TGTSIDE]),
+ inany_ntop(&tgt->oaddr, fstr1, sizeof(fstr1)),
+ tgt->oport,
+ inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)),
+ tgt->eport);
+ else if (state >= FLOW_STATE_INI)
+ flow_log_(f, pri, "%s [%s]:%hu -> [%s]:%hu => ?",
+ pif_name(f->pif[INISIDE]),
+ inany_ntop(&ini->eaddr, estr0, sizeof(estr0)),
+ ini->eport,
+ inany_ntop(&ini->oaddr, fstr0, sizeof(fstr0)),
+ ini->oport);
+}
+
+/**
+ * flow_set_state() - Change flow's state
+ * @f: Flow changing state
+ * @state: New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+ uint8_t oldstate = f->state;
+
+ ASSERT(state < FLOW_NUM_STATES);
+ ASSERT(oldstate < FLOW_NUM_STATES);
+
+ f->state = state;
+ flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+ FLOW_STATE(f));
+
+ flow_log_details_(f, LOG_DEBUG, MAX(state, oldstate));
+}
+
+/**
+ * flow_initiate_() - Move flow to INI, setting pif[INISIDE]
+ * @flow: Flow to change state
+ * @pif: pif of the initiating side
+ */
+static void flow_initiate_(union flow *flow, uint8_t pif)
+{
+ struct flow_common *f = &flow->f;
+
+ ASSERT(pif != PIF_NONE);
+ ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
+ ASSERT(f->type == FLOW_TYPE_NONE);
+ ASSERT(f->pif[INISIDE] == PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
+
+ f->pif[INISIDE] = pif;
+ flow_set_state(f, FLOW_STATE_INI);
+}
+
+/**
+ * flow_initiate_af() - Move flow to INI, setting INISIDE details
+ * @flow: Flow to change state
+ * @pif: pif of the initiating side
+ * @af: Address family of @saddr and @daddr
+ * @saddr: Source address (pointer to in_addr or in6_addr)
+ * @sport: Endpoint port
+ * @daddr: Destination address (pointer to in_addr or in6_addr)
+ * @dport: Destination port
+ *
+ * Return: pointer to the initiating flowside information
+ */
+const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif,
+ sa_family_t af,
+ const void *saddr, in_port_t sport,
+ const void *daddr, in_port_t dport)
+{
+ struct flowside *ini = &flow->f.side[INISIDE];
+
+ flowside_from_af(ini, af, saddr, sport, daddr, dport);
+ flow_initiate_(flow, pif);
+ return ini;
}
/**
- * flow_start() - Set flow type for new flow and log
- * @flow: Flow to set type for
- * @type: Type for new flow
- * @iniside: Which side initiated the new flow
+ * flow_initiate_sa() - Move flow to INI, setting INISIDE details
+ * @flow: Flow to change state
+ * @pif: pif of the initiating side
+ * @ssa: Source socket address
+ * @dport: Destination port
*
- * Return: @flow
+ * Return: pointer to the initiating flowside information
+ */
+const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
+ const union sockaddr_inany *ssa,
+ in_port_t dport)
+{
+ struct flowside *ini = &flow->f.side[INISIDE];
+
+ inany_from_sockaddr(&ini->eaddr, &ini->eport, ssa);
+ if (inany_v4(&ini->eaddr))
+ ini->oaddr = inany_any4;
+ else
+ ini->oaddr = inany_any6;
+ ini->oport = dport;
+ flow_initiate_(flow, pif);
+ return ini;
+}
+
+/**
+ * flow_target() - Determine where flow should forward to, and move to TGT
+ * @c: Execution context
+ * @flow: Flow to forward
+ * @proto: Protocol
*
- * Should be called before setting any flow type specific fields in the flow
- * table entry.
+ * Return: pointer to the target flowside information
+ */
+const struct flowside *flow_target(const struct ctx *c, union flow *flow,
+ uint8_t proto)
+{
+ char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
+ struct flow_common *f = &flow->f;
+ const struct flowside *ini = &f->side[INISIDE];
+ struct flowside *tgt = &f->side[TGTSIDE];
+ uint8_t tgtpif = PIF_NONE;
+
+ ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI);
+ ASSERT(f->type == FLOW_TYPE_NONE);
+ ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE);
+ ASSERT(flow->f.state == FLOW_STATE_INI);
+
+ switch (f->pif[INISIDE]) {
+ case PIF_TAP:
+ tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
+ break;
+
+ case PIF_SPLICE:
+ tgtpif = fwd_nat_from_splice(c, proto, ini, tgt);
+ break;
+
+ case PIF_HOST:
+ tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
+ break;
+
+ default:
+ flow_err(flow, "No rules to forward %s [%s]:%hu -> [%s]:%hu",
+ pif_name(f->pif[INISIDE]),
+ inany_ntop(&ini->eaddr, estr, sizeof(estr)),
+ ini->eport,
+ inany_ntop(&ini->oaddr, fstr, sizeof(fstr)),
+ ini->oport);
+ }
+
+ if (tgtpif == PIF_NONE)
+ return NULL;
+
+ f->pif[TGTSIDE] = tgtpif;
+ flow_set_state(f, FLOW_STATE_TGT);
+ return tgt;
+}
+
+/**
+ * flow_set_type() - Set type and move to TYPED
+ * @flow: Flow to change state
+ * @pif: pif of the initiating side
*/
-union flow *flow_start(union flow *flow, enum flow_type type,
- unsigned iniside)
+union flow *flow_set_type(union flow *flow, enum flow_type type)
{
- (void)iniside;
- flow->f.type = type;
- flow_dbg(flow, "START %s", flow_type_str[flow->f.type]);
+ struct flow_common *f = &flow->f;
+
+ ASSERT(type != FLOW_TYPE_NONE);
+ ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_TGT);
+ ASSERT(f->type == FLOW_TYPE_NONE);
+ ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
+
+ f->type = type;
+ flow_set_state(f, FLOW_STATE_TYPED);
return flow;
}
/**
- * flow_end() - Clear flow type for finished flow and log
- * @flow: Flow to clear
+ * flow_activate() - Move flow to ACTIVE
+ * @f: Flow to change state
*/
-static void flow_end(union flow *flow)
+void flow_activate(struct flow_common *f)
{
- if (flow->f.type == FLOW_TYPE_NONE)
- return; /* Nothing to do */
+ ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
+ ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] != PIF_NONE);
- flow_dbg(flow, "END %s", flow_type_str[flow->f.type]);
- flow->f.type = FLOW_TYPE_NONE;
+ flow_set_state(f, FLOW_STATE_ACTIVE);
+ flow_new_entry = NULL;
}
/**
@@ -192,9 +499,12 @@ union flow *flow_alloc(void)
{
union flow *flow = &flowtab[flow_first_free];
+ ASSERT(!flow_new_entry);
+
if (flow_first_free >= FLOW_MAX)
return NULL;
+ ASSERT(flow->f.state == FLOW_STATE_FREE);
ASSERT(flow->f.type == FLOW_TYPE_NONE);
ASSERT(flow->free.n >= 1);
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
@@ -217,7 +527,10 @@ union flow *flow_alloc(void)
flow_first_free = flow->free.next;
}
+ flow_new_entry = flow;
memset(flow, 0, sizeof(*flow));
+ flow_set_state(&flow->f, FLOW_STATE_NEW);
+
return flow;
}
@@ -229,15 +542,228 @@ union flow *flow_alloc(void)
*/
void flow_alloc_cancel(union flow *flow)
{
+ ASSERT(flow_new_entry == flow);
+ ASSERT(flow->f.state == FLOW_STATE_NEW ||
+ flow->f.state == FLOW_STATE_INI ||
+ flow->f.state == FLOW_STATE_TGT ||
+ flow->f.state == FLOW_STATE_TYPED);
ASSERT(flow_first_free > FLOW_IDX(flow));
- flow_end(flow);
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
+
/* Put it back in a length 1 free cluster, don't attempt to fully
* reverse flow_alloc()s steps. This will get folded together the next
* time flow_defer_handler runs anyway() */
flow->free.n = 1;
flow->free.next = flow_first_free;
flow_first_free = FLOW_IDX(flow);
+ flow_new_entry = NULL;
+}
+
+/**
+ * flow_hash() - Calculate hash value for one side of a flow
+ * @c: Execution context
+ * @proto: Protocol of this flow (IP L4 protocol number)
+ * @pif: pif of the side to hash
+ * @side: Flowside (must not have unspecified parts)
+ *
+ * Return: hash value
+ */
+static uint64_t flow_hash(const struct ctx *c, uint8_t proto, uint8_t pif,
+ const struct flowside *side)
+{
+ struct siphash_state state = SIPHASH_INIT(c->hash_secret);
+
+ inany_siphash_feed(&state, &side->oaddr);
+ inany_siphash_feed(&state, &side->eaddr);
+
+ return siphash_final(&state, 38, (uint64_t)proto << 40 |
+ (uint64_t)pif << 32 |
+ (uint64_t)side->oport << 16 |
+ (uint64_t)side->eport);
+}
+
+/**
+ * flow_sidx_hash() - Calculate hash value for given side of a given flow
+ * @c: Execution context
+ * @sidx: Flow & side index to get hash for
+ *
+ * Return: hash value, of the flow & side represented by @sidx
+ */
+static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
+{
+ const struct flow_common *f = &flow_at_sidx(sidx)->f;
+ const struct flowside *side = &f->side[sidx.sidei];
+ uint8_t pif = f->pif[sidx.sidei];
+
+ /* For the hash table to work, entries must have complete endpoint
+ * information, and at least a forwarding port.
+ */
+ ASSERT(pif != PIF_NONE && !inany_is_unspecified(&side->eaddr) &&
+ side->eport != 0 && side->oport != 0);
+
+ return flow_hash(c, FLOW_PROTO(f), pif, side);
+}
+
+/**
+ * flow_hash_probe_() - Find hash bucket for a flow, given hash
+ * @hash: Raw hash value for flow & side
+ * @sidx: Flow and side to find bucket for
+ *
+ * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * suitable free bucket for it.
+ */
+static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
+{
+ unsigned b = hash % FLOW_HASH_SIZE;
+
+ /* Linear probing */
+ while (flow_sidx_valid(flow_hashtab[b]) &&
+ !flow_sidx_eq(flow_hashtab[b], sidx))
+ b = mod_sub(b, 1, FLOW_HASH_SIZE);
+
+ return b;
+}
+
+/**
+ * flow_hash_probe() - Find hash bucket for a flow
+ * @c: Execution context
+ * @sidx: Flow and side to find bucket for
+ *
+ * Return: If @sidx is in the hash table, its current bucket, otherwise a
+ * suitable free bucket for it.
+ */
+static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
+{
+ return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx);
+}
+
+/**
+ * flow_hash_insert() - Insert side of a flow into into hash table
+ * @c: Execution context
+ * @sidx: Flow & side index
+ *
+ * Return: raw (un-modded) hash value of side of flow
+ */
+uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
+{
+ uint64_t hash = flow_sidx_hash(c, sidx);
+ unsigned b = flow_hash_probe_(hash, sidx);
+
+ flow_hashtab[b] = sidx;
+ flow_dbg(flow_at_sidx(sidx), "Side %u hash table insert: bucket: %u",
+ sidx.sidei, b);
+
+ return hash;
+}
+
+/**
+ * flow_hash_remove() - Drop side of a flow from the hash table
+ * @c: Execution context
+ * @sidx: Side of flow to remove
+ */
+void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
+{
+ unsigned b = flow_hash_probe(c, sidx), s;
+
+ if (!flow_sidx_valid(flow_hashtab[b]))
+ return; /* Redundant remove */
+
+ flow_dbg(flow_at_sidx(sidx), "Side %u hash table remove: bucket: %u",
+ sidx.sidei, b);
+
+ /* Scan the remainder of the cluster */
+ for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
+ flow_sidx_valid(flow_hashtab[s]);
+ s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
+ unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;
+
+ if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
+ /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
+ debug("hash table remove: shuffle %u -> %u", s, b);
+ flow_hashtab[b] = flow_hashtab[s];
+ b = s;
+ }
+ }
+
+ flow_hashtab[b] = FLOW_SIDX_NONE;
+}
+
+/**
+ * flowside_lookup() - Look for a matching flowside in the flow table
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: pif to look for in the table
+ * @side: Flowside to look for in the table
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto,
+ uint8_t pif, const struct flowside *side)
+{
+ flow_sidx_t sidx;
+ union flow *flow;
+ unsigned b;
+
+ b = flow_hash(c, proto, pif, side) % FLOW_HASH_SIZE;
+ while ((sidx = flow_hashtab[b], flow = flow_at_sidx(sidx)) &&
+ !(FLOW_PROTO(&flow->f) == proto &&
+ flow->f.pif[sidx.sidei] == pif &&
+ flowside_eq(&flow->f.side[sidx.sidei], side)))
+ b = mod_sub(b, 1, FLOW_HASH_SIZE);
+
+ return flow_hashtab[b];
+}
+
+/**
+ * flow_lookup_af() - Look up a flow given addressing information
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: Interface of the flow
+ * @af: Address family, AF_INET or AF_INET6
+ * @eaddr: Guest side endpoint address (guest local address)
+ * @oaddr: Our guest side address (guest remote address)
+ * @eport: Guest side endpoint port (guest local port)
+ * @oport: Our guest side port (guest remote port)
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+flow_sidx_t flow_lookup_af(const struct ctx *c,
+ uint8_t proto, uint8_t pif, sa_family_t af,
+ const void *eaddr, const void *oaddr,
+ in_port_t eport, in_port_t oport)
+{
+ struct flowside side;
+
+ flowside_from_af(&side, af, eaddr, eport, oaddr, oport);
+ return flowside_lookup(c, proto, pif, &side);
+}
+
+/**
+ * flow_lookup_sa() - Look up a flow given an endpoint socket address
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: Interface of the flow
+ * @esa: Socket address of the endpoint
+ * @oport: Our port number
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
+ const void *esa, in_port_t oport)
+{
+ struct flowside side = {
+ .oport = oport,
+ };
+
+ inany_from_sockaddr(&side.eaddr, &side.eport, esa);
+ if (inany_v4(&side.eaddr))
+ side.oaddr = inany_any4;
+ else
+ side.oaddr = inany_any6;
+
+ return flowside_lookup(c, proto, pif, &side);
}
/**
@@ -257,11 +783,14 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
flow_timer_run = *now;
}
+ ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
+
for (idx = 0; idx < FLOW_MAX; idx++) {
union flow *flow = &flowtab[idx];
bool closed = false;
- if (flow->f.type == FLOW_TYPE_NONE) {
+ switch (flow->f.state) {
+ case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
/* First entry of a free cluster must have n >= 1 */
@@ -283,17 +812,43 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
continue;
}
+ case FLOW_STATE_NEW:
+ case FLOW_STATE_INI:
+ case FLOW_STATE_TGT:
+ case FLOW_STATE_TYPED:
+ /* Incomplete flow at end of cycle */
+ ASSERT(false);
+ break;
+
+ case FLOW_STATE_ACTIVE:
+ /* Nothing to do */
+ break;
+
+ default:
+ ASSERT(false);
+ }
+
switch (flow->f.type) {
case FLOW_TYPE_NONE:
ASSERT(false);
break;
case FLOW_TCP:
- closed = tcp_flow_defer(flow);
+ closed = tcp_flow_defer(&flow->tcp);
break;
case FLOW_TCP_SPLICE:
- closed = tcp_splice_flow_defer(flow);
+ closed = tcp_splice_flow_defer(&flow->tcp_splice);
if (!closed && timer)
- tcp_splice_timer(c, flow);
+ tcp_splice_timer(c, &flow->tcp_splice);
+ break;
+ case FLOW_PING4:
+ case FLOW_PING6:
+ if (timer)
+ closed = icmp_ping_timer(c, &flow->ping, now);
+ break;
+ case FLOW_UDP:
+ closed = udp_flow_defer(&flow->udp);
+ if (!closed && timer)
+ closed = udp_flow_timer(c, &flow->udp, now);
break;
default:
/* Assume other flow types don't need any handling */
@@ -301,7 +856,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
}
if (closed) {
- flow_end(flow);
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
if (free_head) {
/* Add slot to current free cluster */
@@ -328,7 +884,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*/
void flow_init(void)
{
+ unsigned b;
+
/* Initial state is a single free cluster containing the whole table */
flowtab[0].free.n = FLOW_MAX;
flowtab[0].free.next = FLOW_MAX;
+
+ for (b = 0; b < FLOW_HASH_SIZE; b++)
+ flow_hashtab[b] = FLOW_SIDX_NONE;
}