diff options
Diffstat (limited to 'tcp_splice.c')
-rw-r--r-- | tcp_splice.c | 297 |
1 files changed, 123 insertions, 174 deletions
diff --git a/tcp_splice.c b/tcp_splice.c index d066112..93f8bce 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 i686:fcntl64 */ #include <sched.h> @@ -73,10 +73,7 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; /* Pool of pre-opened pipes */ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2]; -#define CONN_V6(x) (x->flags & SPLICE_V6) -#define CONN_V4(x) (!CONN_V6(x)) -#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) -#define CONN(idx) (&FLOW(idx)->tcp_splice) +#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set)) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -95,6 +92,24 @@ static int tcp_sock_refill_ns(void *arg); static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); /** + * conn_at_sidx() - Get spliced TCP connection specific flow at given sidx + * @sidx: Flow and side to retrieve + * + * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid. + * Asserts if the flow at @sidx is not FLOW_TCP_SPLICE. + */ +static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) +{ + union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + + ASSERT(flow->f.type == FLOW_TCP_SPLICE); + return &flow->tcp_splice; +} + +/** * tcp_splice_conn_epoll_events() - epoll events masks for given state * @events: Connection event flags * @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket @@ -102,19 +117,22 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); static void tcp_splice_conn_epoll_events(uint16_t events, struct epoll_event ev[]) { - ev[0].events = ev[1].events = 0; + unsigned sidei; + + flow_foreach_sidei(sidei) + ev[sidei].events = 0; if (events & SPLICE_ESTABLISHED) { - if (!(events & FIN_SENT_1)) - ev[0].events = EPOLLIN | EPOLLRDHUP; - if (!(events & FIN_SENT_0)) - ev[1].events = EPOLLIN | EPOLLRDHUP; + flow_foreach_sidei(sidei) { + if (!(events & FIN_SENT(!sidei))) + ev[sidei].events = EPOLLIN | EPOLLRDHUP; + } } else if (events & SPLICE_CONNECT) { ev[1].events = EPOLLOUT; } - ev[0].events |= (events & OUT_WAIT_0) ? EPOLLOUT : 0; - ev[1].events |= (events & OUT_WAIT_1) ? EPOLLOUT : 0; + flow_foreach_sidei(sidei) + ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0; } /** @@ -235,32 +253,31 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) - * @flow: Flow table entry for this connection + * @conn: Connection entry to handle * * Return: true if the flow is ready to free, false otherwise */ -bool tcp_splice_flow_defer(union flow *flow) +bool tcp_splice_flow_defer(struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; - unsigned side; + unsigned sidei; - if (!(flow->tcp_splice.flags & CLOSING)) + if (!(conn->flags & CLOSING)) return false; - for (side = 0; side < SIDES; side++) { + flow_foreach_sidei(sidei) { /* Flushing might need to block: don't recycle them. */ - if (conn->pipe[side][0] >= 0) { - close(conn->pipe[side][0]); - close(conn->pipe[side][1]); - conn->pipe[side][0] = conn->pipe[side][1] = -1; + if (conn->pipe[sidei][0] >= 0) { + close(conn->pipe[sidei][0]); + close(conn->pipe[sidei][1]); + conn->pipe[sidei][0] = conn->pipe[sidei][1] = -1; } - if (conn->s[side] >= 0) { - close(conn->s[side]); - conn->s[side] = -1; + if (conn->s[sidei] >= 0) { + close(conn->s[sidei]); + conn->s[sidei] = -1; } - conn->read[side] = conn->written[side] = 0; + conn->read[sidei] = conn->written[sidei] = 0; } conn->events = SPLICE_CLOSED; @@ -280,33 +297,33 @@ bool tcp_splice_flow_defer(union flow *flow) static int tcp_splice_connect_finish(const struct ctx *c, struct tcp_splice_conn *conn) { - unsigned side; + unsigned sidei; int i = 0; - for (side = 0; side < SIDES; side++) { + flow_foreach_sidei(sidei) { for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { if (splice_pipe_pool[i][0] >= 0) { - SWAP(conn->pipe[side][0], + SWAP(conn->pipe[sidei][0], splice_pipe_pool[i][0]); - SWAP(conn->pipe[side][1], + SWAP(conn->pipe[sidei][1], splice_pipe_pool[i][1]); break; } } - if (conn->pipe[side][0] < 0) { - if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) { + if (conn->pipe[sidei][0] < 0) { + if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { flow_err(conn, "cannot create %d->%d pipe: %s", - side, !side, strerror(errno)); + sidei, !sidei, strerror(errno)); conn_flag(c, conn, CLOSING); return -EIO; } - if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ, + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { flow_trace(conn, "cannot set %d->%d pipe size to %zu", - side, !side, c->tcp.pipe_size); + sidei, !sidei, c->tcp.pipe_size); } } } @@ -321,31 +338,20 @@ static int tcp_splice_connect_finish(const struct ctx *c, * tcp_splice_connect() - Create and connect socket for new spliced connection * @c: Execution context * @conn: Connection pointer - * @af: Address family - * @pif: pif on which to create socket - * @port: Destination port, host order * * Return: 0 for connect() succeeded or in progress, negative value on error */ -static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, - sa_family_t af, uint8_t pif, in_port_t port) +static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - .sin_addr = IN4ADDR_LOOPBACK_INIT, - }; - const struct sockaddr *sa; + const struct flowside *tgt = &conn->f.side[TGTSIDE]; + sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6; + uint8_t tgtpif = conn->f.pif[TGTSIDE]; + union sockaddr_inany sa; socklen_t sl; - if (pif == PIF_HOST) + if (tgtpif == PIF_HOST) conn->s[1] = tcp_conn_sock(c, af); - else if (pif == PIF_SPLICE) + else if (tgtpif == PIF_SPLICE) conn->s[1] = tcp_conn_sock_ns(c, af); else ASSERT(0); @@ -359,15 +365,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, conn->s[1]); } - if (CONN_V6(conn)) { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } else { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } + pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport); - if (connect(conn->s[1], sa, sl)) { + if (connect(conn->s[1], &sa.sa, sl)) { if (errno != EINPROGRESS) { flow_trace(conn, "Couldn't connect socket for splice: %s", strerror(errno)); @@ -414,67 +414,19 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) /** * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection * @c: Execution context - * @pif0: pif id of side 0 - * @dstport: Side 0 destination port of connection * @flow: flow to initialise * @s0: Accepted (side 0) socket * @sa: Peer address of connection * - * Return: true if able to create a spliced connection, false otherwise * #syscalls:pasta setsockopt */ -bool tcp_splice_conn_from_sock(const struct ctx *c, - uint8_t pif0, in_port_t dstport, - union flow *flow, int s0, - const union sockaddr_inany *sa) +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) { - struct tcp_splice_conn *conn; - union inany_addr src; - in_port_t srcport; - sa_family_t af; - uint8_t pif1; + struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, + tcp_splice); - if (c->mode != MODE_PASTA) - return false; + ASSERT(c->mode == MODE_PASTA); - inany_from_sockaddr(&src, &srcport, sa); - af = inany_v4(&src) ? AF_INET : AF_INET6; - - switch (pif0) { - case PIF_SPLICE: - if (!inany_is_loopback(&src)) { - char str[INANY_ADDRSTRLEN]; - - /* We can't use flow_err() etc. because we haven't set - * the flow type yet - */ - warn("Bad source address %s for splice, closing", - inany_ntop(&src, str, sizeof(str))); - - /* We *don't* want to fall back to tap */ - flow_alloc_cancel(flow); - return true; - } - - pif1 = PIF_HOST; - dstport += c->tcp.fwd_out.delta[dstport]; - break; - - case PIF_HOST: - if (!inany_is_loopback(&src)) - return false; - - pif1 = PIF_SPLICE; - dstport += c->tcp.fwd_in.delta[dstport]; - break; - - default: - return false; - } - - conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0); - - conn->flags = af == AF_INET ? 0 : SPLICE_V6; conn->s[0] = s0; conn->s[1] = -1; conn->pipe[0][0] = conn->pipe[0][1] = -1; @@ -483,10 +435,10 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); - if (tcp_splice_connect(c, conn, af, pif1, dstport)) + if (tcp_splice_connect(c, conn)) conn_flag(c, conn, CLOSING); - return true; + FLOW_ACTIVATE(conn); } /** @@ -500,8 +452,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - struct tcp_splice_conn *conn = CONN(ref.flowside.flow); - unsigned side = ref.flowside.side, fromside; + struct tcp_splice_conn *conn = conn_at_sidx(ref.flowside); + unsigned evsidei = ref.flowside.sidei, fromsidei; uint8_t lowat_set_flag, lowat_act_flag; int eof, never_read; @@ -533,30 +485,31 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, } if (events & EPOLLOUT) { - fromside = !side; - conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1); + fromsidei = !evsidei; + conn_event(c, conn, ~OUT_WAIT(evsidei)); } else { - fromside = side; + fromsidei = evsidei; } if (events & EPOLLRDHUP) /* For side 0 this is fake, but implied */ - conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1); + conn_event(c, conn, FIN_RCVD(evsidei)); swap: eof = 0; never_read = 1; - lowat_set_flag = fromside == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; - lowat_act_flag = fromside == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1; + lowat_set_flag = RCVLOWAT_SET(fromsidei); + lowat_act_flag = RCVLOWAT_ACT(fromsidei); while (1) { - ssize_t readlen, to_write = 0, written; + ssize_t readlen, written, pending; int more = 0; retry: - readlen = splice(conn->s[fromside], NULL, - conn->pipe[fromside][1], NULL, c->tcp.pipe_size, + readlen = splice(conn->s[fromsidei], NULL, + conn->pipe[fromsidei][1], NULL, + c->tcp.pipe_size, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); flow_trace(conn, "%zi from read-side call", readlen); if (readlen < 0) { @@ -565,14 +518,11 @@ retry: if (errno != EAGAIN) goto close; - - to_write = c->tcp.pipe_size; } else if (!readlen) { eof = 1; - to_write = c->tcp.pipe_size; } else { never_read = 0; - to_write += readlen; + if (readlen >= (long)c->tcp.pipe_size * 90 / 100) more = SPLICE_F_MORE; @@ -581,11 +531,11 @@ retry: } eintr: - written = splice(conn->pipe[fromside][0], NULL, - conn->s[!fromside], NULL, to_write, + written = splice(conn->pipe[fromsidei][0], NULL, + conn->s[!fromsidei], NULL, c->tcp.pipe_size, SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); flow_trace(conn, "%zi from write-side call (passed %zi)", - written, to_write); + written, c->tcp.pipe_size); /* Most common case: skip updating counters. */ if (readlen > 0 && readlen == written) { @@ -596,18 +546,23 @@ eintr: readlen > (long)c->tcp.pipe_size / 10) { int lowat = c->tcp.pipe_size / 4; - setsockopt(conn->s[fromside], SOL_SOCKET, - SO_RCVLOWAT, &lowat, sizeof(lowat)); - - conn_flag(c, conn, lowat_set_flag); - conn_flag(c, conn, lowat_act_flag); + if (setsockopt(conn->s[fromsidei], SOL_SOCKET, + SO_RCVLOWAT, + &lowat, sizeof(lowat))) { + flow_trace(conn, + "Setting SO_RCVLOWAT %i: %s", + lowat, strerror(errno)); + } else { + conn_flag(c, conn, lowat_set_flag); + conn_flag(c, conn, lowat_act_flag); + } } break; } - conn->read[fromside] += readlen > 0 ? readlen : 0; - conn->written[fromside] += written > 0 ? written : 0; + conn->read[fromsidei] += readlen > 0 ? readlen : 0; + conn->written[fromsidei] += written > 0 ? written : 0; if (written < 0) { if (errno == EINTR) @@ -616,47 +571,43 @@ eintr: if (errno != EAGAIN) goto close; - if (never_read) + if (conn->read[fromsidei] == conn->written[fromsidei]) break; - conn_event(c, conn, - fromside == 0 ? OUT_WAIT_1 : OUT_WAIT_0); + conn_event(c, conn, OUT_WAIT(!fromsidei)); break; } if (never_read && written == (long)(c->tcp.pipe_size)) goto retry; - if (!never_read && written < to_write) { - to_write -= written; + pending = conn->read[fromsidei] - conn->written[fromsidei]; + if (!never_read && written > 0 && written < pending) goto retry; - } if (eof) break; } - if ((conn->events & FIN_RCVD_0) && !(conn->events & FIN_SENT_1)) { - if (conn->read[fromside] == conn->written[fromside] && eof) { - shutdown(conn->s[1], SHUT_WR); - conn_event(c, conn, FIN_SENT_1); - } - } + if (conn->read[fromsidei] == conn->written[fromsidei] && eof) { + unsigned sidei; - if ((conn->events & FIN_RCVD_1) && !(conn->events & FIN_SENT_0)) { - if (conn->read[fromside] == conn->written[fromside] && eof) { - shutdown(conn->s[0], SHUT_WR); - conn_event(c, conn, FIN_SENT_0); + flow_foreach_sidei(sidei) { + if ((conn->events & FIN_RCVD(sidei)) && + !(conn->events & FIN_SENT(!sidei))) { + shutdown(conn->s[!sidei], SHUT_WR); + conn_event(c, conn, FIN_SENT(!sidei)); + } } } - if (CONN_HAS(conn, FIN_SENT_0 | FIN_SENT_1)) + if (CONN_HAS(conn, FIN_SENT(0) | FIN_SENT(1))) goto close; if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { events = EPOLLIN; - fromside = !fromside; + fromsidei = !fromsidei; goto swap; } @@ -721,7 +672,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) continue; if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { trace("TCP (spliced): cannot set pool pipe size to %zu", c->tcp.pipe_size); } @@ -734,6 +685,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) * * Return: 0 */ +/* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int tcp_sock_refill_ns(void *arg) { const struct ctx *c = (const struct ctx *)arg; @@ -786,29 +738,26 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @flow: Flow table entry + * @conn: Connection to handle */ -void tcp_splice_timer(const struct ctx *c, union flow *flow) +void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) { - struct tcp_splice_conn *conn = &flow->tcp_splice; - int side; + unsigned sidei; ASSERT(!(conn->flags & CLOSING)); - for (side = 0; side < SIDES; side++) { - uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; - uint8_t act = side == 0 ? RCVLOWAT_ACT_0 : RCVLOWAT_ACT_1; - - if ((conn->flags & set) && !(conn->flags & act)) { - if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT, + flow_foreach_sidei(sidei) { + if ((conn->flags & RCVLOWAT_SET(sidei)) && + !(conn->flags & RCVLOWAT_ACT(sidei))) { + if (setsockopt(conn->s[sidei], SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int))) { flow_trace(conn, "can't set SO_RCVLOWAT on %d", - conn->s[side]); + conn->s[sidei]); } - conn_flag(c, conn, ~set); + conn_flag(c, conn, ~RCVLOWAT_SET(sidei)); } } - conn_flag(c, conn, ~RCVLOWAT_ACT_0); - conn_flag(c, conn, ~RCVLOWAT_ACT_1); + flow_foreach_sidei(sidei) + conn_flag(c, conn, ~RCVLOWAT_ACT(sidei)); } |