aboutgitcodebugslistschat
path: root/tcp_splice.c
diff options
context:
space:
mode:
Diffstat (limited to 'tcp_splice.c')
-rw-r--r--tcp_splice.c297
1 files changed, 123 insertions, 174 deletions
diff --git a/tcp_splice.c b/tcp_splice.c
index d066112..93f8bce 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -28,7 +28,7 @@
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
*
- * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64
*/
#include <sched.h>
@@ -73,10 +73,7 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE];
/* Pool of pre-opened pipes */
static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2];
-#define CONN_V6(x) (x->flags & SPLICE_V6)
-#define CONN_V4(x) (!CONN_V6(x))
-#define CONN_HAS(conn, set) ((conn->events & (set)) == (set))
-#define CONN(idx) (&FLOW(idx)->tcp_splice)
+#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
/* Display strings for connection events */
static const char *tcp_splice_event_str[] __attribute((__unused__)) = {
@@ -95,6 +92,24 @@ static int tcp_sock_refill_ns(void *arg);
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
/**
+ * conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
+ * @sidx: Flow and side to retrieve
+ *
+ * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid.
+ * Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
+ */
+static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
+{
+ union flow *flow = flow_at_sidx(sidx);
+
+ if (!flow)
+ return NULL;
+
+ ASSERT(flow->f.type == FLOW_TCP_SPLICE);
+ return &flow->tcp_splice;
+}
+
+/**
* tcp_splice_conn_epoll_events() - epoll events masks for given state
* @events: Connection event flags
* @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket
@@ -102,19 +117,22 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
static void tcp_splice_conn_epoll_events(uint16_t events,
struct epoll_event ev[])
{
- ev[0].events = ev[1].events = 0;
+ unsigned sidei;
+
+ flow_foreach_sidei(sidei)
+ ev[sidei].events = 0;
if (events & SPLICE_ESTABLISHED) {
- if (!(events & FIN_SENT_1))
- ev[0].events = EPOLLIN | EPOLLRDHUP;
- if (!(events & FIN_SENT_0))
- ev[1].events = EPOLLIN | EPOLLRDHUP;
+ flow_foreach_sidei(sidei) {
+ if (!(events & FIN_SENT(!sidei)))
+ ev[sidei].events = EPOLLIN | EPOLLRDHUP;
+ }
} else if (events & SPLICE_CONNECT) {
ev[1].events = EPOLLOUT;
}
- ev[0].events |= (events & OUT_WAIT_0) ? EPOLLOUT : 0;
- ev[1].events |= (events & OUT_WAIT_1) ? EPOLLOUT : 0;
+ flow_foreach_sidei(sidei)
+ ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
}
/**
@@ -235,32 +253,31 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
/**
* tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed)
- * @flow: Flow table entry for this connection
+ * @conn: Connection entry to handle
*
* Return: true if the flow is ready to free, false otherwise
*/
-bool tcp_splice_flow_defer(union flow *flow)
+bool tcp_splice_flow_defer(struct tcp_splice_conn *conn)
{
- struct tcp_splice_conn *conn = &flow->tcp_splice;
- unsigned side;
+ unsigned sidei;
- if (!(flow->tcp_splice.flags & CLOSING))
+ if (!(conn->flags & CLOSING))
return false;
- for (side = 0; side < SIDES; side++) {
+ flow_foreach_sidei(sidei) {
/* Flushing might need to block: don't recycle them. */
- if (conn->pipe[side][0] >= 0) {
- close(conn->pipe[side][0]);
- close(conn->pipe[side][1]);
- conn->pipe[side][0] = conn->pipe[side][1] = -1;
+ if (conn->pipe[sidei][0] >= 0) {
+ close(conn->pipe[sidei][0]);
+ close(conn->pipe[sidei][1]);
+ conn->pipe[sidei][0] = conn->pipe[sidei][1] = -1;
}
- if (conn->s[side] >= 0) {
- close(conn->s[side]);
- conn->s[side] = -1;
+ if (conn->s[sidei] >= 0) {
+ close(conn->s[sidei]);
+ conn->s[sidei] = -1;
}
- conn->read[side] = conn->written[side] = 0;
+ conn->read[sidei] = conn->written[sidei] = 0;
}
conn->events = SPLICE_CLOSED;
@@ -280,33 +297,33 @@ bool tcp_splice_flow_defer(union flow *flow)
static int tcp_splice_connect_finish(const struct ctx *c,
struct tcp_splice_conn *conn)
{
- unsigned side;
+ unsigned sidei;
int i = 0;
- for (side = 0; side < SIDES; side++) {
+ flow_foreach_sidei(sidei) {
for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) {
if (splice_pipe_pool[i][0] >= 0) {
- SWAP(conn->pipe[side][0],
+ SWAP(conn->pipe[sidei][0],
splice_pipe_pool[i][0]);
- SWAP(conn->pipe[side][1],
+ SWAP(conn->pipe[sidei][1],
splice_pipe_pool[i][1]);
break;
}
}
- if (conn->pipe[side][0] < 0) {
- if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) {
+ if (conn->pipe[sidei][0] < 0) {
+ if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
flow_err(conn, "cannot create %d->%d pipe: %s",
- side, !side, strerror(errno));
+ sidei, !sidei, strerror(errno));
conn_flag(c, conn, CLOSING);
return -EIO;
}
- if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ,
- c->tcp.pipe_size)) {
+ if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
+ c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
flow_trace(conn,
"cannot set %d->%d pipe size to %zu",
- side, !side, c->tcp.pipe_size);
+ sidei, !sidei, c->tcp.pipe_size);
}
}
}
@@ -321,31 +338,20 @@ static int tcp_splice_connect_finish(const struct ctx *c,
* tcp_splice_connect() - Create and connect socket for new spliced connection
* @c: Execution context
* @conn: Connection pointer
- * @af: Address family
- * @pif: pif on which to create socket
- * @port: Destination port, host order
*
* Return: 0 for connect() succeeded or in progress, negative value on error
*/
-static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
- sa_family_t af, uint8_t pif, in_port_t port)
+static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
{
- struct sockaddr_in6 addr6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htons(port),
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
- };
- struct sockaddr_in addr4 = {
- .sin_family = AF_INET,
- .sin_port = htons(port),
- .sin_addr = IN4ADDR_LOOPBACK_INIT,
- };
- const struct sockaddr *sa;
+ const struct flowside *tgt = &conn->f.side[TGTSIDE];
+ sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6;
+ uint8_t tgtpif = conn->f.pif[TGTSIDE];
+ union sockaddr_inany sa;
socklen_t sl;
- if (pif == PIF_HOST)
+ if (tgtpif == PIF_HOST)
conn->s[1] = tcp_conn_sock(c, af);
- else if (pif == PIF_SPLICE)
+ else if (tgtpif == PIF_SPLICE)
conn->s[1] = tcp_conn_sock_ns(c, af);
else
ASSERT(0);
@@ -359,15 +365,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
conn->s[1]);
}
- if (CONN_V6(conn)) {
- sa = (struct sockaddr *)&addr6;
- sl = sizeof(addr6);
- } else {
- sa = (struct sockaddr *)&addr4;
- sl = sizeof(addr4);
- }
+ pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
- if (connect(conn->s[1], sa, sl)) {
+ if (connect(conn->s[1], &sa.sa, sl)) {
if (errno != EINPROGRESS) {
flow_trace(conn, "Couldn't connect socket for splice: %s",
strerror(errno));
@@ -414,67 +414,19 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
/**
* tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection
* @c: Execution context
- * @pif0: pif id of side 0
- * @dstport: Side 0 destination port of connection
* @flow: flow to initialise
* @s0: Accepted (side 0) socket
* @sa: Peer address of connection
*
- * Return: true if able to create a spliced connection, false otherwise
* #syscalls:pasta setsockopt
*/
-bool tcp_splice_conn_from_sock(const struct ctx *c,
- uint8_t pif0, in_port_t dstport,
- union flow *flow, int s0,
- const union sockaddr_inany *sa)
+void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
{
- struct tcp_splice_conn *conn;
- union inany_addr src;
- in_port_t srcport;
- sa_family_t af;
- uint8_t pif1;
+ struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE,
+ tcp_splice);
- if (c->mode != MODE_PASTA)
- return false;
+ ASSERT(c->mode == MODE_PASTA);
- inany_from_sockaddr(&src, &srcport, sa);
- af = inany_v4(&src) ? AF_INET : AF_INET6;
-
- switch (pif0) {
- case PIF_SPLICE:
- if (!inany_is_loopback(&src)) {
- char str[INANY_ADDRSTRLEN];
-
- /* We can't use flow_err() etc. because we haven't set
- * the flow type yet
- */
- warn("Bad source address %s for splice, closing",
- inany_ntop(&src, str, sizeof(str)));
-
- /* We *don't* want to fall back to tap */
- flow_alloc_cancel(flow);
- return true;
- }
-
- pif1 = PIF_HOST;
- dstport += c->tcp.fwd_out.delta[dstport];
- break;
-
- case PIF_HOST:
- if (!inany_is_loopback(&src))
- return false;
-
- pif1 = PIF_SPLICE;
- dstport += c->tcp.fwd_in.delta[dstport];
- break;
-
- default:
- return false;
- }
-
- conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
-
- conn->flags = af == AF_INET ? 0 : SPLICE_V6;
conn->s[0] = s0;
conn->s[1] = -1;
conn->pipe[0][0] = conn->pipe[0][1] = -1;
@@ -483,10 +435,10 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)))
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
- if (tcp_splice_connect(c, conn, af, pif1, dstport))
+ if (tcp_splice_connect(c, conn))
conn_flag(c, conn, CLOSING);
- return true;
+ FLOW_ACTIVATE(conn);
}
/**
@@ -500,8 +452,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
uint32_t events)
{
- struct tcp_splice_conn *conn = CONN(ref.flowside.flow);
- unsigned side = ref.flowside.side, fromside;
+ struct tcp_splice_conn *conn = conn_at_sidx(ref.flowside);
+ unsigned evsidei = ref.flowside.sidei, fromsidei;
uint8_t lowat_set_flag, lowat_act_flag;
int eof, never_read;
@@ -533,30 +485,31 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
}
if (events & EPOLLOUT) {
- fromside = !side;
- conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1);
+ fromsidei = !evsidei;
+ conn_event(c, conn, ~OUT_WAIT(evsidei));
} else {
- fromside = side;
+ fromsidei = evsidei;
}
if (events & EPOLLRDHUP)
/* For side 0 this is fake, but implied */
- conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1);
+ conn_event(c, conn, FIN_RCVD(evsidei));
swap:
eof = 0;
never_read = 1;
- lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
- lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
+ lowat_set_flag = RCVLOWAT_SET(fromsidei);
+ lowat_act_flag = RCVLOWAT_ACT(fromsidei);
while (1) {
- ssize_t readlen, to_write = 0, written;
+ ssize_t readlen, written, pending;
int more = 0;
retry:
- readlen = splice(conn->s[fromside], NULL,
- conn->pipe[fromside][1], NULL, c->tcp.pipe_size,
+ readlen = splice(conn->s[fromsidei], NULL,
+ conn->pipe[fromsidei][1], NULL,
+ c->tcp.pipe_size,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from read-side call", readlen);
if (readlen < 0) {
@@ -565,14 +518,11 @@ retry:
if (errno != EAGAIN)
goto close;
-
- to_write = c->tcp.pipe_size;
} else if (!readlen) {
eof = 1;
- to_write = c->tcp.pipe_size;
} else {
never_read = 0;
- to_write += readlen;
+
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
more = SPLICE_F_MORE;
@@ -581,11 +531,11 @@ retry:
}
eintr:
- written = splice(conn->pipe[fromside][0], NULL,
- conn->s[!fromside], NULL, to_write,
+ written = splice(conn->pipe[fromsidei][0], NULL,
+ conn->s[!fromsidei], NULL, c->tcp.pipe_size,
SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
flow_trace(conn, "%zi from write-side call (passed %zi)",
- written, to_write);
+ written, c->tcp.pipe_size);
/* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) {
@@ -596,18 +546,23 @@ eintr:
readlen > (long)c->tcp.pipe_size / 10) {
int lowat = c->tcp.pipe_size / 4;
- setsockopt(conn->s[fromside], SOL_SOCKET,
- SO_RCVLOWAT, &lowat, sizeof(lowat));
-
- conn_flag(c, conn, lowat_set_flag);
- conn_flag(c, conn, lowat_act_flag);
+ if (setsockopt(conn->s[fromsidei], SOL_SOCKET,
+ SO_RCVLOWAT,
+ &lowat, sizeof(lowat))) {
+ flow_trace(conn,
+ "Setting SO_RCVLOWAT %i: %s",
+ lowat, strerror(errno));
+ } else {
+ conn_flag(c, conn, lowat_set_flag);
+ conn_flag(c, conn, lowat_act_flag);
+ }
}
break;
}
- conn->read[fromside] += readlen > 0 ? readlen : 0;
- conn->written[fromside] += written > 0 ? written : 0;
+ conn->read[fromsidei] += readlen > 0 ? readlen : 0;
+ conn->written[fromsidei] += written > 0 ? written : 0;
if (written < 0) {
if (errno == EINTR)
@@ -616,47 +571,43 @@ eintr:
if (errno != EAGAIN)
goto close;
- if (never_read)
+ if (conn->read[fromsidei] == conn->written[fromsidei])
break;
- conn_event(c, conn,
- fromside == 0 ? OUT_WAIT_1 : OUT_WAIT_0);
+ conn_event(c, conn, OUT_WAIT(!fromsidei));
break;
}
if (never_read && written == (long)(c->tcp.pipe_size))
goto retry;
- if (!never_read && written < to_write) {
- to_write -= written;
+ pending = conn->read[fromsidei] - conn->written[fromsidei];
+ if (!never_read && written > 0 && written < pending)
goto retry;
- }
if (eof)
break;
}
- if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) {
- if (conn->read[fromside] == conn->written[fromside] && eof) {
- shutdown(conn->s[1], SHUT_WR);
- conn_event(c, conn, FIN_SENT_1);
- }
- }
+ if (conn->read[fromsidei] == conn->written[fromsidei] && eof) {
+ unsigned sidei;
- if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) {
- if (conn->read[fromside] == conn->written[fromside] && eof) {
- shutdown(conn->s[0], SHUT_WR);
- conn_event(c, conn, FIN_SENT_0);
+ flow_foreach_sidei(sidei) {
+ if ((conn->events & FIN_RCVD(sidei)) &&
+ !(conn->events & FIN_SENT(!sidei))) {
+ shutdown(conn->s[!sidei], SHUT_WR);
+ conn_event(c, conn, FIN_SENT(!sidei));
+ }
}
}
- if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1))
+ if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1)))
goto close;
if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) {
events = EPOLLIN;
- fromside = !fromside;
+ fromsidei = !fromsidei;
goto swap;
}
@@ -721,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
continue;
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
- c->tcp.pipe_size)) {
+ c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
trace("TCP (spliced): cannot set pool pipe size to %zu",
c->tcp.pipe_size);
}
@@ -734,6 +685,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
*
* Return: 0
*/
+/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */
static int tcp_sock_refill_ns(void *arg)
{
const struct ctx *c = (const struct ctx *)arg;
@@ -786,29 +738,26 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
* @c: Execution context
- * @flow: Flow table entry
+ * @conn: Connection to handle
*/
-void tcp_splice_timer(const struct ctx *c, union flow *flow)
+void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
{
- struct tcp_splice_conn *conn = &flow->tcp_splice;
- int side;
+ unsigned sidei;
ASSERT(!(conn->flags & CLOSING));
- for (side = 0; side < SIDES; side++) {
- uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1;
- uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1;
-
- if ((conn->flags & set) && !(conn->flags & act)) {
- if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT,
+ flow_foreach_sidei(sidei) {
+ if ((conn->flags & RCVLOWAT_SET(sidei)) &&
+ !(conn->flags & RCVLOWAT_ACT(sidei))) {
+ if (setsockopt(conn->s[sidei], SOL_SOCKET, SO_RCVLOWAT,
&((int){ 1 }), sizeof(int))) {
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
- conn->s[side]);
+ conn->s[sidei]);
}
- conn_flag(c, conn, ~set);
+ conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
}
}
- conn_flag(c, conn, ~RCVLOWAT_ACT_0);
- conn_flag(c, conn, ~RCVLOWAT_ACT_1);
+ flow_foreach_sidei(sidei)
+ conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
}