diff options
Diffstat (limited to 'tcp_splice.c')
| -rw-r--r-- | tcp_splice.c | 234 |
1 files changed, 115 insertions, 119 deletions
diff --git a/tcp_splice.c b/tcp_splice.c index 483e45d..a7c04ca 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -28,7 +28,7 @@ * - FIN_SENT_0: FIN (write shutdown) sent to accepted socket * - FIN_SENT_1: FIN (write shutdown) sent to target socket * - * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64 + * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64 */ #include <sched.h> @@ -44,7 +44,6 @@ #include <net/ethernet.h> #include <netinet/in.h> #include <netinet/tcp.h> -#include <sys/epoll.h> #include <sys/types.h> #include <sys/socket.h> @@ -56,6 +55,7 @@ #include "siphash.h" #include "inany.h" #include "flow.h" +#include "epoll_ctl.h" #include "flow_table.h" @@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); * conn_at_sidx() - Get spliced TCP connection specific flow at given sidx * @sidx: Flow and side to retrieve * - * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid. + * Return: spliced TCP connection at @sidx, or NULL of @sidx is invalid. * Asserts if the flow at @sidx is not FLOW_TCP_SPLICE. */ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) @@ -114,68 +114,62 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx) * @events: Connection event flags * @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket */ -static void tcp_splice_conn_epoll_events(uint16_t events, - struct epoll_event ev[]) +static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) { - unsigned sidei; - - flow_foreach_sidei(sidei) - ev[sidei].events = 0; + uint32_t e = 0; if (events & SPLICE_ESTABLISHED) { - flow_foreach_sidei(sidei) { - if (!(events & FIN_SENT(!sidei))) - ev[sidei].events = EPOLLIN | EPOLLRDHUP; - } - } else if (events & SPLICE_CONNECT) { - ev[1].events = EPOLLOUT; + if (!(events & FIN_SENT(!sidei))) + e = EPOLLIN | EPOLLRDHUP; + } else if (sidei == 1 && events & SPLICE_CONNECT) { + e = EPOLLOUT; } - flow_foreach_sidei(sidei) - ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0; + if (events & OUT_WAIT(sidei)) + e |= EPOLLOUT; + if (events & OUT_WAIT(!sidei)) + e &= ~EPOLLIN; + + return e; } /** * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events - * @c: Execution context * @conn: Connection pointer * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_splice_epoll_ctl(const struct ctx *c, - struct tcp_splice_conn *conn) +static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - const union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], - .flowside = FLOW_SIDX(conn, 0) }, - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], - .flowside = FLOW_SIDX(conn, 1) } - }; - struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, - { .data.u64 = ref[1].u64 } }; - - tcp_splice_conn_epoll_events(conn->events, ev); - - if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || - epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { + uint32_t events[2]; + int m; + + if (flow_in_epoll(&conn->f)) { + m = EPOLL_CTL_MOD; + } else { + flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); + m = EPOLL_CTL_ADD; + } + + events[0] = tcp_splice_conn_epoll_events(conn->events, 0); + events[1] = tcp_splice_conn_epoll_events(conn->events, 1); + + if (flow_epoll_set(&conn->f, m, events[0], conn->s[0], 0) || + flow_epoll_set(&conn->f, m, events[1], conn->s[1], 1)) { int ret = -errno; - flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno)); + flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } - conn->in_epoll = true; - return 0; } /** * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag - * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, +static void conn_flag_do(struct tcp_splice_conn *conn, unsigned long flag) { if (flag & (flag - 1)) { @@ -200,25 +194,23 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, } if (flag == CLOSING) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL); + epoll_del(flow_epollfd(&conn->f), conn->s[0]); + epoll_del(flow_epollfd(&conn->f), conn->s[1]); } } -#define conn_flag(c, conn, flag) \ +#define conn_flag(conn, flag) \ do { \ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ + conn_flag_do(conn, flag); \ } while (0) /** * conn_event_do() - Set and log connection events, update epoll state - * @c: Execution context * @conn: Connection pointer * @event: Connection event */ -static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, - unsigned long event) +static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event) { if (event & (event - 1)) { int flag_index = fls(~event); @@ -240,14 +232,14 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]); } - if (tcp_splice_epoll_ctl(c, conn)) - conn_flag(c, conn, CLOSING); + if (tcp_splice_epoll_ctl(conn)) + conn_flag(conn, CLOSING); } -#define conn_event(c, conn, event) \ +#define conn_event(conn, event) \ do { \ flow_trace(conn, "event at %s:%i",__func__, __LINE__); \ - conn_event_do(c, conn, event); \ + conn_event_do(conn, event); \ } while (0) @@ -313,14 +305,14 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[sidei][0] < 0) { if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { - flow_err(conn, "cannot create %d->%d pipe: %s", - sidei, !sidei, strerror(errno)); - conn_flag(c, conn, CLOSING); + flow_perror(conn, "cannot create %d->%d pipe", + sidei, !sidei); + conn_flag(conn, CLOSING); return -EIO; } if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { flow_trace(conn, "cannot set %d->%d pipe size to %zu", sidei, !sidei, c->tcp.pipe_size); @@ -329,7 +321,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, } if (!(conn->events & SPLICE_ESTABLISHED)) - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(conn, SPLICE_ESTABLISHED); return 0; } @@ -347,10 +339,10 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6; uint8_t tgtpif = conn->f.pif[TGTSIDE]; union sockaddr_inany sa; - socklen_t sl; + int one = 1; if (tgtpif == PIF_HOST) - conn->s[1] = tcp_conn_sock(c, af); + conn->s[1] = tcp_conn_sock(af); else if (tgtpif == PIF_SPLICE) conn->s[1] = tcp_conn_sock_ns(c, af); else @@ -359,24 +351,33 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) if (conn->s[1] < 0) return -1; - if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, - &((int){ 1 }), sizeof(int))) { + if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) { flow_trace(conn, "failed to set TCP_QUICKACK on socket %i", conn->s[1]); } - pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport); + if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[0]); + } + + if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) { + flow_trace(conn, "failed to set TCP_NODELAY on socket %i", + conn->s[1]); + } + + pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport); + + conn_event(conn, SPLICE_CONNECT); - if (connect(conn->s[1], &sa.sa, sl)) { + if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { flow_trace(conn, "Couldn't connect socket for splice: %s", - strerror(errno)); + strerror_(errno)); return -errno; } - - conn_event(c, conn, SPLICE_CONNECT); } else { - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(conn, SPLICE_ESTABLISHED); return tcp_splice_connect_finish(c, conn); } @@ -388,7 +389,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) * @c: Execution context * @af: Address family (AF_INET or AF_INET6) * - * Return: Socket fd in the namespace on success, -errno on failure + * Return: socket fd in the namespace on success, -errno on failure */ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) { @@ -436,7 +437,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); if (tcp_splice_connect(c, conn)) - conn_flag(c, conn, CLOSING); + conn_flag(conn, CLOSING); FLOW_ACTIVATE(conn); } @@ -468,11 +469,10 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) - flow_err(conn, "Error retrieving SO_ERROR: %s", - strerror(errno)); + flow_perror(conn, "Error retrieving SO_ERROR"); else flow_trace(conn, "Error event on socket: %s", - strerror(err)); + strerror_(err)); goto close; } @@ -486,14 +486,14 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, if (events & EPOLLOUT) { fromsidei = !evsidei; - conn_event(c, conn, ~OUT_WAIT(evsidei)); + conn_event(conn, ~OUT_WAIT(evsidei)); } else { fromsidei = evsidei; } if (events & EPOLLRDHUP) /* For side 0 this is fake, but implied */ - conn_event(c, conn, FIN_RCVD(evsidei)); + conn_event(conn, FIN_RCVD(evsidei)); swap: eof = 0; @@ -503,49 +503,53 @@ swap: lowat_act_flag = RCVLOWAT_ACT(fromsidei); while (1) { - ssize_t readlen, to_write = 0, written; + ssize_t readlen, written, pending; int more = 0; retry: - readlen = splice(conn->s[fromsidei], NULL, - conn->pipe[fromsidei][1], NULL, - c->tcp.pipe_size, - SPLICE_F_MOVE | SPLICE_F_NONBLOCK); - flow_trace(conn, "%zi from read-side call", readlen); - if (readlen < 0) { - if (errno == EINTR) - goto retry; + do + readlen = splice(conn->s[fromsidei], NULL, + conn->pipe[fromsidei][1], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + while (readlen < 0 && errno == EINTR); + + if (readlen < 0 && errno != EAGAIN) + goto close; - if (errno != EAGAIN) - goto close; + flow_trace(conn, "%zi from read-side call", readlen); - to_write = c->tcp.pipe_size; - } else if (!readlen) { + if (!readlen) { eof = 1; - to_write = c->tcp.pipe_size; - } else { + } else if (readlen > 0) { never_read = 0; - to_write += readlen; + if (readlen >= (long)c->tcp.pipe_size * 90 / 100) more = SPLICE_F_MORE; if (conn->flags & lowat_set_flag) - conn_flag(c, conn, lowat_act_flag); + conn_flag(conn, lowat_act_flag); } -eintr: - written = splice(conn->pipe[fromsidei][0], NULL, - conn->s[!fromsidei], NULL, to_write, - SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + do + written = splice(conn->pipe[fromsidei][0], NULL, + conn->s[!fromsidei], NULL, + c->tcp.pipe_size, + SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + while (written < 0 && errno == EINTR); + + if (written < 0 && errno != EAGAIN) + goto close; + flow_trace(conn, "%zi from write-side call (passed %zi)", - written, to_write); + written, c->tcp.pipe_size); /* Most common case: skip updating counters. */ if (readlen > 0 && readlen == written) { if (readlen >= (long)c->tcp.pipe_size * 10 / 100) continue; - if (conn->flags & lowat_set_flag && + if (!(conn->flags & lowat_set_flag) && readlen > (long)c->tcp.pipe_size / 10) { int lowat = c->tcp.pipe_size / 4; @@ -554,10 +558,10 @@ eintr: &lowat, sizeof(lowat))) { flow_trace(conn, "Setting SO_RCVLOWAT %i: %s", - lowat, strerror(errno)); + lowat, strerror_(errno)); } else { - conn_flag(c, conn, lowat_set_flag); - conn_flag(c, conn, lowat_act_flag); + conn_flag(conn, lowat_set_flag); + conn_flag(conn, lowat_act_flag); } } @@ -568,26 +572,19 @@ eintr: conn->written[fromsidei] += written > 0 ? written : 0; if (written < 0) { - if (errno == EINTR) - goto eintr; - - if (errno != EAGAIN) - goto close; - if (conn->read[fromsidei] == conn->written[fromsidei]) break; - conn_event(c, conn, OUT_WAIT(!fromsidei)); + conn_event(conn, OUT_WAIT(!fromsidei)); break; } if (never_read && written == (long)(c->tcp.pipe_size)) goto retry; - if (!never_read && written < to_write) { - to_write -= written; + pending = conn->read[fromsidei] - conn->written[fromsidei]; + if (!never_read && written > 0 && written < pending) goto retry; - } if (eof) break; @@ -600,7 +597,7 @@ eintr: if ((conn->events & FIN_RCVD(sidei)) && !(conn->events & FIN_SENT(!sidei))) { shutdown(conn->s[!sidei], SHUT_WR); - conn_event(c, conn, FIN_SENT(!sidei)); + conn_event(conn, FIN_SENT(!sidei)); } } } @@ -621,7 +618,7 @@ eintr: return; close: - conn_flag(c, conn, CLOSING); + conn_flag(conn, CLOSING); } /** @@ -676,7 +673,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c) continue; if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ, - c->tcp.pipe_size)) { + c->tcp.pipe_size) != (int)c->tcp.pipe_size) { trace("TCP (spliced): cannot set pool pipe size to %zu", c->tcp.pipe_size); } @@ -697,16 +694,16 @@ static int tcp_sock_refill_ns(void *arg) ns_enter(c); if (c->ifi4) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); + int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET); if (rc < 0) warn("TCP: Error refilling IPv4 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } if (c->ifi6) { - int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6); if (rc < 0) warn("TCP: Error refilling IPv6 ns socket pool: %s", - strerror(-rc)); + strerror_(-rc)); } return 0; @@ -741,10 +738,9 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections - * @c: Execution context * @conn: Connection to handle */ -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) +void tcp_splice_timer(struct tcp_splice_conn *conn) { unsigned sidei; @@ -758,10 +754,10 @@ void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) flow_trace(conn, "can't set SO_RCVLOWAT on %d", conn->s[sidei]); } - conn_flag(c, conn, ~RCVLOWAT_SET(sidei)); + conn_flag(conn, ~RCVLOWAT_SET(sidei)); } } flow_foreach_sidei(sidei) - conn_flag(c, conn, ~RCVLOWAT_ACT(sidei)); + conn_flag(conn, ~RCVLOWAT_ACT(sidei)); } |
