about git code bugs lists chat
path: root/tcp_splice.c
diff options
context:
space:
mode:
Diffstat (limited to 'tcp_splice.c')
-rw-r--r-- tcp_splice.c 234
1 files changed, 115 insertions, 119 deletions
diff --git a/tcp_splice.c b/tcp_splice.c
index 483e45d..a7c04ca 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -28,7 +28,7 @@
* - FIN_SENT_0: FIN (write shutdown) sent to accepted socket
* - FIN_SENT_1: FIN (write shutdown) sent to target socket
*
- * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64
+ * #syscalls:pasta pipe2|pipe fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
*/
#include <sched.h>
@@ -44,7 +44,6 @@
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
-#include <sys/epoll.h>
#include <sys/types.h>
#include <sys/socket.h>
@@ -56,6 +55,7 @@
#include "siphash.h"
#include "inany.h"
#include "flow.h"
+#include "epoll_ctl.h"
#include "flow_table.h"
@@ -95,7 +95,7 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af);
* conn_at_sidx() - Get spliced TCP connection specific flow at given sidx
* @sidx: Flow and side to retrieve
*
- * Return: Spliced TCP connection at @sidx, or NULL of @sidx is invalid.
+ * Return: spliced TCP connection at @sidx, or NULL if @sidx is invalid.
* Asserts if the flow at @sidx is not FLOW_TCP_SPLICE.
*/
static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
@@ -114,68 +114,62 @@ static struct tcp_splice_conn *conn_at_sidx(flow_sidx_t sidx)
* @events: Connection event flags
* @ev: Events to fill in, 0 is accepted socket, 1 is connecting socket
*/
-static void tcp_splice_conn_epoll_events(uint16_t events,
- struct epoll_event ev[])
+static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei)
{
- unsigned sidei;
-
- flow_foreach_sidei(sidei)
- ev[sidei].events = 0;
+ uint32_t e = 0;
if (events & SPLICE_ESTABLISHED) {
- flow_foreach_sidei(sidei) {
- if (!(events & FIN_SENT(!sidei)))
- ev[sidei].events = EPOLLIN | EPOLLRDHUP;
- }
- } else if (events & SPLICE_CONNECT) {
- ev[1].events = EPOLLOUT;
+ if (!(events & FIN_SENT(!sidei)))
+ e = EPOLLIN | EPOLLRDHUP;
+ } else if (sidei == 1 && events & SPLICE_CONNECT) {
+ e = EPOLLOUT;
}
- flow_foreach_sidei(sidei)
- ev[sidei].events |= (events & OUT_WAIT(sidei)) ? EPOLLOUT : 0;
+ if (events & OUT_WAIT(sidei))
+ e |= EPOLLOUT;
+ if (events & OUT_WAIT(!sidei))
+ e &= ~EPOLLIN;
+
+ return e;
}
/**
* tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events
- * @c: Execution context
* @conn: Connection pointer
*
* Return: 0 on success, negative error code on failure (not on deletion)
*/
-static int tcp_splice_epoll_ctl(const struct ctx *c,
- struct tcp_splice_conn *conn)
+static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn)
{
- int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
- const union epoll_ref ref[SIDES] = {
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0],
- .flowside = FLOW_SIDX(conn, 0) },
- { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1],
- .flowside = FLOW_SIDX(conn, 1) }
- };
- struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 },
- { .data.u64 = ref[1].u64 } };
-
- tcp_splice_conn_epoll_events(conn->events, ev);
-
- if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) ||
- epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) {
+ uint32_t events[2];
+ int m;
+
+ if (flow_in_epoll(&conn->f)) {
+ m = EPOLL_CTL_MOD;
+ } else {
+ flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
+ m = EPOLL_CTL_ADD;
+ }
+
+ events[0] = tcp_splice_conn_epoll_events(conn->events, 0);
+ events[1] = tcp_splice_conn_epoll_events(conn->events, 1);
+
+ if (flow_epoll_set(&conn->f, m, events[0], conn->s[0], 0) ||
+ flow_epoll_set(&conn->f, m, events[1], conn->s[1], 1)) {
int ret = -errno;
- flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno));
+ flow_perror(conn, "ERROR on epoll_ctl()");
return ret;
}
- conn->in_epoll = true;
-
return 0;
}
/**
* conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag
- * @c: Execution context
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
+static void conn_flag_do(struct tcp_splice_conn *conn,
unsigned long flag)
{
if (flag & (flag - 1)) {
@@ -200,25 +194,23 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
+ epoll_del(flow_epollfd(&conn->f), conn->s[0]);
+ epoll_del(flow_epollfd(&conn->f), conn->s[1]);
}
}
-#define conn_flag(c, conn, flag) \
+#define conn_flag(conn, flag) \
do { \
flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
+ conn_flag_do(conn, flag); \
} while (0)
/**
* conn_event_do() - Set and log connection events, update epoll state
- * @c: Execution context
* @conn: Connection pointer
* @event: Connection event
*/
-static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
- unsigned long event)
+static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event)
{
if (event & (event - 1)) {
int flag_index = fls(~event);
@@ -240,14 +232,14 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn,
flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]);
}
- if (tcp_splice_epoll_ctl(c, conn))
- conn_flag(c, conn, CLOSING);
+ if (tcp_splice_epoll_ctl(conn))
+ conn_flag(conn, CLOSING);
}
-#define conn_event(c, conn, event) \
+#define conn_event(conn, event) \
do { \
flow_trace(conn, "event at %s:%i",__func__, __LINE__); \
- conn_event_do(c, conn, event); \
+ conn_event_do(conn, event); \
} while (0)
@@ -313,14 +305,14 @@ static int tcp_splice_connect_finish(const struct ctx *c,
if (conn->pipe[sidei][0] < 0) {
if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) {
- flow_err(conn, "cannot create %d->%d pipe: %s",
- sidei, !sidei, strerror(errno));
- conn_flag(c, conn, CLOSING);
+ flow_perror(conn, "cannot create %d->%d pipe",
+ sidei, !sidei);
+ conn_flag(conn, CLOSING);
return -EIO;
}
if (fcntl(conn->pipe[sidei][0], F_SETPIPE_SZ,
- c->tcp.pipe_size)) {
+ c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
flow_trace(conn,
"cannot set %d->%d pipe size to %zu",
sidei, !sidei, c->tcp.pipe_size);
@@ -329,7 +321,7 @@ static int tcp_splice_connect_finish(const struct ctx *c,
}
if (!(conn->events & SPLICE_ESTABLISHED))
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ conn_event(conn, SPLICE_ESTABLISHED);
return 0;
}
@@ -347,10 +339,10 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6;
uint8_t tgtpif = conn->f.pif[TGTSIDE];
union sockaddr_inany sa;
- socklen_t sl;
+ int one = 1;
if (tgtpif == PIF_HOST)
- conn->s[1] = tcp_conn_sock(c, af);
+ conn->s[1] = tcp_conn_sock(af);
else if (tgtpif == PIF_SPLICE)
conn->s[1] = tcp_conn_sock_ns(c, af);
else
@@ -359,24 +351,33 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
if (conn->s[1] < 0)
return -1;
- if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK,
- &((int){ 1 }), sizeof(int))) {
+ if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &one, sizeof(one))) {
flow_trace(conn, "failed to set TCP_QUICKACK on socket %i",
conn->s[1]);
}
- pif_sockaddr(c, &sa, &sl, tgtpif, &tgt->eaddr, tgt->eport);
+ if (setsockopt(conn->s[0], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
+ flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
+ conn->s[0]);
+ }
+
+ if (setsockopt(conn->s[1], SOL_TCP, TCP_NODELAY, &one, sizeof(one))) {
+ flow_trace(conn, "failed to set TCP_NODELAY on socket %i",
+ conn->s[1]);
+ }
+
+ pif_sockaddr(c, &sa, tgtpif, &tgt->eaddr, tgt->eport);
+
+ conn_event(conn, SPLICE_CONNECT);
- if (connect(conn->s[1], &sa.sa, sl)) {
+ if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) {
if (errno != EINPROGRESS) {
flow_trace(conn, "Couldn't connect socket for splice: %s",
- strerror(errno));
+ strerror_(errno));
return -errno;
}
-
- conn_event(c, conn, SPLICE_CONNECT);
} else {
- conn_event(c, conn, SPLICE_ESTABLISHED);
+ conn_event(conn, SPLICE_ESTABLISHED);
return tcp_splice_connect_finish(c, conn);
}
@@ -388,7 +389,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn)
* @c: Execution context
* @af: Address family (AF_INET or AF_INET6)
*
- * Return: Socket fd in the namespace on success, -errno on failure
+ * Return: socket fd in the namespace on success, -errno on failure
*/
static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af)
{
@@ -436,7 +437,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0)
flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0);
if (tcp_splice_connect(c, conn))
- conn_flag(c, conn, CLOSING);
+ conn_flag(conn, CLOSING);
FLOW_ACTIVATE(conn);
}
@@ -468,11 +469,10 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl);
if (rc)
- flow_err(conn, "Error retrieving SO_ERROR: %s",
- strerror(errno));
+ flow_perror(conn, "Error retrieving SO_ERROR");
else
flow_trace(conn, "Error event on socket: %s",
- strerror(err));
+ strerror_(err));
goto close;
}
@@ -486,14 +486,14 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref,
if (events & EPOLLOUT) {
fromsidei = !evsidei;
- conn_event(c, conn, ~OUT_WAIT(evsidei));
+ conn_event(conn, ~OUT_WAIT(evsidei));
} else {
fromsidei = evsidei;
}
if (events & EPOLLRDHUP)
/* For side 0 this is fake, but implied */
- conn_event(c, conn, FIN_RCVD(evsidei));
+ conn_event(conn, FIN_RCVD(evsidei));
swap:
eof = 0;
@@ -503,49 +503,53 @@ swap:
lowat_act_flag = RCVLOWAT_ACT(fromsidei);
while (1) {
- ssize_t readlen, to_write = 0, written;
+ ssize_t readlen, written, pending;
int more = 0;
retry:
- readlen = splice(conn->s[fromsidei], NULL,
- conn->pipe[fromsidei][1], NULL,
- c->tcp.pipe_size,
- SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
- flow_trace(conn, "%zi from read-side call", readlen);
- if (readlen < 0) {
- if (errno == EINTR)
- goto retry;
+ do
+ readlen = splice(conn->s[fromsidei], NULL,
+ conn->pipe[fromsidei][1], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+ while (readlen < 0 && errno == EINTR);
+
+ if (readlen < 0 && errno != EAGAIN)
+ goto close;
- if (errno != EAGAIN)
- goto close;
+ flow_trace(conn, "%zi from read-side call", readlen);
- to_write = c->tcp.pipe_size;
- } else if (!readlen) {
+ if (!readlen) {
eof = 1;
- to_write = c->tcp.pipe_size;
- } else {
+ } else if (readlen > 0) {
never_read = 0;
- to_write += readlen;
+
if (readlen >= (long)c->tcp.pipe_size * 90 / 100)
more = SPLICE_F_MORE;
if (conn->flags & lowat_set_flag)
- conn_flag(c, conn, lowat_act_flag);
+ conn_flag(conn, lowat_act_flag);
}
-eintr:
- written = splice(conn->pipe[fromsidei][0], NULL,
- conn->s[!fromsidei], NULL, to_write,
- SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ do
+ written = splice(conn->pipe[fromsidei][0], NULL,
+ conn->s[!fromsidei], NULL,
+ c->tcp.pipe_size,
+ SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK);
+ while (written < 0 && errno == EINTR);
+
+ if (written < 0 && errno != EAGAIN)
+ goto close;
+
flow_trace(conn, "%zi from write-side call (passed %zi)",
- written, to_write);
+ written, c->tcp.pipe_size);
/* Most common case: skip updating counters. */
if (readlen > 0 && readlen == written) {
if (readlen >= (long)c->tcp.pipe_size * 10 / 100)
continue;
- if (conn->flags & lowat_set_flag &&
+ if (!(conn->flags & lowat_set_flag) &&
readlen > (long)c->tcp.pipe_size / 10) {
int lowat = c->tcp.pipe_size / 4;
@@ -554,10 +558,10 @@ eintr:
&lowat, sizeof(lowat))) {
flow_trace(conn,
"Setting SO_RCVLOWAT %i: %s",
- lowat, strerror(errno));
+ lowat, strerror_(errno));
} else {
- conn_flag(c, conn, lowat_set_flag);
- conn_flag(c, conn, lowat_act_flag);
+ conn_flag(conn, lowat_set_flag);
+ conn_flag(conn, lowat_act_flag);
}
}
@@ -568,26 +572,19 @@ eintr:
conn->written[fromsidei] += written > 0 ? written : 0;
if (written < 0) {
- if (errno == EINTR)
- goto eintr;
-
- if (errno != EAGAIN)
- goto close;
-
if (conn->read[fromsidei] == conn->written[fromsidei])
break;
- conn_event(c, conn, OUT_WAIT(!fromsidei));
+ conn_event(conn, OUT_WAIT(!fromsidei));
break;
}
if (never_read && written == (long)(c->tcp.pipe_size))
goto retry;
- if (!never_read && written < to_write) {
- to_write -= written;
+ pending = conn->read[fromsidei] - conn->written[fromsidei];
+ if (!never_read && written > 0 && written < pending)
goto retry;
- }
if (eof)
break;
@@ -600,7 +597,7 @@ eintr:
if ((conn->events & FIN_RCVD(sidei)) &&
!(conn->events & FIN_SENT(!sidei))) {
shutdown(conn->s[!sidei], SHUT_WR);
- conn_event(c, conn, FIN_SENT(!sidei));
+ conn_event(conn, FIN_SENT(!sidei));
}
}
}
@@ -621,7 +618,7 @@ eintr:
return;
close:
- conn_flag(c, conn, CLOSING);
+ conn_flag(conn, CLOSING);
}
/**
@@ -676,7 +673,7 @@ static void tcp_splice_pipe_refill(const struct ctx *c)
continue;
if (fcntl(splice_pipe_pool[i][0], F_SETPIPE_SZ,
- c->tcp.pipe_size)) {
+ c->tcp.pipe_size) != (int)c->tcp.pipe_size) {
trace("TCP (spliced): cannot set pool pipe size to %zu",
c->tcp.pipe_size);
}
@@ -697,16 +694,16 @@ static int tcp_sock_refill_ns(void *arg)
ns_enter(c);
if (c->ifi4) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+ int rc = tcp_sock_refill_pool(ns_sock_pool4, AF_INET);
if (rc < 0)
warn("TCP: Error refilling IPv4 ns socket pool: %s",
- strerror(-rc));
+ strerror_(-rc));
}
if (c->ifi6) {
- int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+ int rc = tcp_sock_refill_pool(ns_sock_pool6, AF_INET6);
if (rc < 0)
warn("TCP: Error refilling IPv6 ns socket pool: %s",
- strerror(-rc));
+ strerror_(-rc));
}
return 0;
@@ -741,10 +738,9 @@ void tcp_splice_init(struct ctx *c)
/**
* tcp_splice_timer() - Timer for spliced connections
- * @c: Execution context
* @conn: Connection to handle
*/
-void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
+void tcp_splice_timer(struct tcp_splice_conn *conn)
{
unsigned sidei;
@@ -758,10 +754,10 @@ void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn)
flow_trace(conn, "can't set SO_RCVLOWAT on %d",
conn->s[sidei]);
}
- conn_flag(c, conn, ~RCVLOWAT_SET(sidei));
+ conn_flag(conn, ~RCVLOWAT_SET(sidei));
}
}
flow_foreach_sidei(sidei)
- conn_flag(c, conn, ~RCVLOWAT_ACT(sidei));
+ conn_flag(conn, ~RCVLOWAT_ACT(sidei));
}