diff options
| -rw-r--r-- | flow.c | 14 | ||||
| -rw-r--r-- | flow.h | 6 | ||||
| -rw-r--r-- | icmp.c | 27 | ||||
| -rw-r--r-- | icmp.h | 3 | ||||
| -rw-r--r-- | migrate.c | 27 | ||||
| -rw-r--r-- | migrate.h | 8 | ||||
| -rw-r--r-- | passt.c | 8 | ||||
| -rw-r--r-- | tcp.c | 373 | ||||
| -rw-r--r-- | tcp.h | 5 | ||||
| -rw-r--r-- | tcp_buf.c | 78 | ||||
| -rw-r--r-- | tcp_buf.h | 8 | ||||
| -rw-r--r-- | tcp_conn.h | 8 | ||||
| -rw-r--r-- | tcp_internal.h | 27 | ||||
| -rw-r--r-- | tcp_splice.c | 59 | ||||
| -rw-r--r-- | tcp_splice.h | 3 | ||||
| -rw-r--r-- | tcp_vu.c | 42 | ||||
| -rw-r--r-- | tcp_vu.h | 6 | ||||
| -rw-r--r-- | udp.c | 47 | ||||
| -rw-r--r-- | udp_flow.c | 19 |
19 files changed, 449 insertions, 319 deletions
@@ -1087,16 +1087,18 @@ static int flow_migrate_repair_all(struct ctx *c, bool enable) * @c: Execution context * @stage: Migration stage information (unused) * @fd: Migration file descriptor (unused) + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, - int fd) + int fd, const struct timespec *now) { int rc; (void)stage; (void)fd; + (void)now; if (flow_migrate_need_repair()) repair_wait(c); @@ -1112,11 +1114,12 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, * @c: Execution context (unused) * @stage: Migration stage information (unused) * @fd: Migration file descriptor + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, - int fd) + int fd, const struct timespec *now) { uint32_t count = 0; bool first = true; @@ -1187,7 +1190,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * as EIO). */ foreach_established_tcp_flow(flow) { - rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp); + rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp, now); if (rc) { flow_err(flow, "Can't send extended data: %s", strerror_(-rc)); @@ -1205,11 +1208,12 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, * @c: Execution context * @stage: Migration stage information (unused) * @fd: Migration file descriptor + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, - int fd) + int fd, const struct timespec *now) { uint32_t count; unsigned i; @@ -1246,7 +1250,7 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, repair_flush(c); for (i = 0; i < count; i++) { - rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd); + rc = tcp_flow_migrate_target_ext(c, &flowtab[i].tcp, fd, now); if (rc) { flow_dbg(FLOW(i), "Migration data failure, abort: %s", strerror_(-rc)); @@ -274,11 +274,11 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now); int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, int fd); int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage, - int fd); + int fd, const struct timespec *now); int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, - int fd); + int fd, const struct timespec *now); int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, - int fd); + int fd, const struct timespec *now); void flow_log__(const struct flow_common *f, int pri, bool perror, bool details, enum flow_state state, const char *fmt, ...); @@ -66,8 +66,10 @@ static struct icmp_ping_flow *ping_at_sidx(flow_sidx_t sidx) * icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket * @c: Execution context * @ref: epoll reference + * @now: Current timestamp */ -void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) +void icmp_sock_handler(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { struct icmp_ping_flow *pingf = ping_at_sidx(ref.flowside); const struct flowside *ini = &pingf->f.side[INISIDE]; @@ -84,7 +86,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) { - flow_perror(pingf, "recvfrom() error"); + flow_perror_ratelimit(pingf, now, "recvfrom() error"); return; } @@ -142,7 +144,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) return; unexpected: - flow_err(pingf, "Unexpected packet on ping socket"); + flow_err_ratelimit(pingf, now, "Unexpected packet on ping socket"); } /** @@ -165,12 +167,15 @@ static void icmp_ping_close(const struct ctx *c, * @id: ICMP id for the new socket * @saddr: Source address * @daddr: Destination address + * @now: Current timestamp * * Return: newly opened ping flow, or NULL on failure */ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, sa_family_t af, uint16_t id, - const void *saddr, const void *daddr) + const void *saddr, + const void *daddr, + const struct timespec *now) { uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6; @@ -186,10 +191,11 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { - flow_err(flow, "No support for forwarding %s from %s to %s", - proto == IPPROTO_ICMP ? "ICMP" : "ICMPv6", - pif_name(flow->f.pif[INISIDE]), - pif_name(flow->f.pif[TGTSIDE])); + flow_err_ratelimit( + flow, now, "No support for forwarding %s from %s to %s", + proto == IPPROTO_ICMP ? "ICMP" : "ICMPv6", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); goto cancel; } @@ -299,7 +305,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (flow) pingf = &flow->ping; - else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr))) + else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr, now))) return 1; tgt = &pingf->f.side[TGTSIDE]; @@ -317,7 +323,8 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, msh.msg_flags = 0; if (sendmsg(pingf->sock, &msh, MSG_NOSIGNAL) < 0) { - flow_dbg_perror(pingf, "failed to relay request to socket"); + flow_warn_perror_ratelimit(pingf, now, + "failed to relay request to socket"); } else { flow_dbg(pingf, "echo request to socket, ID: %"PRIu16", seq: %"PRIu16, @@ -13,7 +13,8 @@ struct ctx; struct icmp_ping_flow; -void icmp_sock_handler(const struct ctx *c, union epoll_ref ref); +void icmp_sock_handler(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, struct iov_tail *data, const struct timespec *now); @@ -48,12 +48,14 @@ struct migrate_seen_addrs_v2 { * @c: Execution context * @stage: Migration stage, unused * @fd: File descriptor for state transfer + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ /* cppcheck-suppress [constParameterCallback, unmatchedSuppression] */ static int seen_addrs_source_v2(struct ctx *c, - const struct migrate_stage *stage, int fd) + const struct migrate_stage *stage, int fd, + const struct timespec *now) { struct migrate_seen_addrs_v2 addrs = { .addr6 = c->ip6.addr_seen, @@ -62,6 +64,7 @@ static int seen_addrs_source_v2(struct ctx *c, }; (void)stage; + (void)now; memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac)); @@ -76,15 +79,18 @@ static int seen_addrs_source_v2(struct ctx *c, * @c: Execution context * @stage: Migration stage, unused * @fd: File descriptor for state transfer + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ static int seen_addrs_target_v2(struct ctx *c, - const struct migrate_stage *stage, int fd) + const struct migrate_stage *stage, int fd, + const struct timespec *now) { struct migrate_seen_addrs_v2 addrs; (void)stage; + (void)now; if (read_all_buf(fd, &addrs, sizeof(addrs))) return errno; @@ -133,10 +139,11 @@ static const struct migrate_version versions[] = { * migrate_source() - Migration as source, send state to hypervisor * @c: Execution context * @fd: File descriptor for state transfer + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ -static int migrate_source(struct ctx *c, int fd) +static int migrate_source(struct ctx *c, int fd, const struct timespec *now) { const struct migrate_version *v = CURRENT_VERSION; const struct migrate_header header = { @@ -159,7 +166,7 @@ static int migrate_source(struct ctx *c, int fd) debug("Source side migration stage: %s", s->name); - if ((ret = s->source(c, s, fd))) { + if ((ret = s->source(c, s, fd, now))) { err("Source migration stage: %s: %s, abort", s->name, strerror_(ret)); return ret; @@ -209,10 +216,11 @@ static const struct migrate_version *migrate_target_read_header(int fd) * migrate_target() - Migration as target, receive state from hypervisor * @c: Execution context * @fd: File descriptor for state transfer + * @now: Current timestamp * * Return: 0 on success, positive error code on failure */ -static int migrate_target(struct ctx *c, int fd) +static int migrate_target(struct ctx *c, int fd, const struct timespec *now) { const struct migrate_version *v; const struct migrate_stage *s; @@ -227,7 +235,7 @@ static int migrate_target(struct ctx *c, int fd) debug("Target side migration stage: %s", s->name); - if ((ret = s->target(c, s, fd))) { + if ((ret = s->target(c, s, fd, now))) { err("Target migration stage: %s: %s, abort", s->name, strerror_(ret)); return ret; @@ -282,8 +290,9 @@ void migrate_request(struct ctx *c, int fd, bool target) /** * migrate_handler() - Send/receive passt internal state to/from hypervisor * @c: Execution context + * @now: Current timestamp */ -void migrate_handler(struct ctx *c) +void migrate_handler(struct ctx *c, const struct timespec *now) { int rc; @@ -294,9 +303,9 @@ void migrate_handler(struct ctx *c) c->device_state_fd, c->migrate_target); if (c->migrate_target) - rc = migrate_target(c, c->device_state_fd); + rc = migrate_target(c, c->device_state_fd, now); else - rc = migrate_source(c, c->device_state_fd); + rc = migrate_source(c, c->device_state_fd, now); migrate_close(c); @@ -30,8 +30,10 @@ struct migrate_header { */ struct migrate_stage { const char *name; - int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd); - int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd); + int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd, + const struct timespec *now); + int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd, + const struct timespec *now); /* Add here separate rollback callbacks if needed */ }; @@ -49,6 +51,6 @@ struct migrate_version { void migrate_init(struct ctx *c); void migrate_close(struct ctx *c); void migrate_request(struct ctx *c, int fd, bool target); -void migrate_handler(struct ctx *c); +void migrate_handler(struct ctx *c, const struct timespec *now); #endif /* MIGRATE_H */ @@ -251,7 +251,7 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) pasta_netns_quit_timer_handler(c, ref); break; case EPOLL_TYPE_TCP: - tcp_sock_handler(c, ref, eventmask); + tcp_sock_handler(c, ref, eventmask, &now); break; case EPOLL_TYPE_TCP_SPLICE: tcp_splice_sock_handler(c, ref, eventmask, &now); @@ -260,7 +260,7 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) tcp_listen_handler(c, ref, &now); break; case EPOLL_TYPE_TCP_TIMER: - tcp_timer_handler(c, ref); + tcp_timer_handler(c, ref, &now); break; case EPOLL_TYPE_UDP_LISTEN: udp_listen_sock_handler(c, ref, eventmask, &now); @@ -269,7 +269,7 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) udp_sock_handler(c, ref, eventmask, &now); break; case EPOLL_TYPE_PING: - icmp_sock_handler(c, ref); + icmp_sock_handler(c, ref, &now); break; case EPOLL_TYPE_VHOST_CMD: vu_control_handler(c->vdev, c->fd_tap, eventmask); @@ -302,7 +302,7 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) post_handler(c, &now); - migrate_handler(c); + migrate_handler(c, &now); } /** @@ -464,17 +464,21 @@ static struct tcp_tap_conn *conn_at_sidx(flow_sidx_t sidx) * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on connection if supported * @conn: Pointer to the TCP connection structure * @offset: Offset in bytes + * @now: Current timestamp * * Return: -1 when it fails, 0 otherwise. */ -int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset) +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset, + const struct timespec *now) { if (!peek_offset_cap) return 0; if (setsockopt(conn->sock, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) { - flow_perror(conn, "Failed to set SO_PEEK_OFF to %i", offset); + flow_perror_ratelimit(conn, now, + "Failed to set SO_PEEK_OFF to %i", + offset); return -1; } return 0; @@ -545,9 +549,12 @@ static int tcp_epoll_ctl(struct tcp_tap_conn *conn) * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp + * * #syscalls timerfd_create timerfd_settime|timerfd_settime32 */ -static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { struct itimerspec it = { { 0 }, { 0 } }; @@ -560,12 +567,13 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1) { - flow_dbg_perror(conn, "failed to get timer"); + flow_perror_ratelimit(conn, now, "failed to get timer"); return; } if (fd > FD_REF_MAX) { - flow_dbg(conn, "timer fd overflow (%d > %d)", - fd, FD_REF_MAX); + flow_err_ratelimit(conn, now, + "timer fd overflow (%d > %d)", + fd, FD_REF_MAX); close(fd); return; } @@ -575,7 +583,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) ref.fd = fd; if (epoll_add(flow_epollfd(&conn->f), EPOLLIN | EPOLLET, ref) < 0) { - flow_dbg(conn, "failed to add timer"); + flow_perror_ratelimit(conn, now, "failed to add timer"); close(fd); return; } @@ -622,9 +630,10 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset + * @now: Current timestamp */ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag) + unsigned long flag, const struct timespec *now) { if (flag & (flag - 1)) { int flag_index = fls(~flag); @@ -646,7 +655,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, * flags and factor this into the logic below. */ if (flag == ACK_FROM_TAP_DUE) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(c, conn, now); return; } @@ -662,7 +671,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(c, conn, now); } /** @@ -670,9 +679,10 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, * @c: Execution context * @conn: Connection pointer * @event: Connection event + * @now: Current timestamp */ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event) + unsigned long event, const struct timespec *now) { int prev, new, num = fls(event); @@ -710,7 +720,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, num == -1 ? "CLOSED" : tcp_event_str[num]); if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) { - conn_flag(c, conn, ACTIVE_CLOSE); + conn_flag(c, conn, ACTIVE_CLOSE, now); } else { if (event == CLOSED) flow_hash_remove(c, TAP_SIDX(conn)); @@ -1101,13 +1111,15 @@ static uint32_t tcp_wnd_from_sndbuf(int s, struct tcp_tap_conn *conn, * @conn: Connection pointer * @force_seq: Force ACK sequence to latest segment, instead of checking socket * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched + * @now: Current timestamp * * Return: 1 if sequence or window were updated, 0 otherwise * * #syscalls ioctl */ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info_linux *tinfo) + bool force_seq, struct tcp_info_linux *tinfo, + const struct timespec *now) { uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; @@ -1221,7 +1233,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, */ /* cppcheck-suppress [knownConditionTrueFalse, unmatchedSuppression] */ if (!conn->wnd_to_tap) - conn_flag(c, conn, ACK_TO_TAP_DUE); + conn_flag(c, conn, ACK_TO_TAP_DUE, now); out: /* Opportunistically store RTT approximation on valid TCP_INFO data */ @@ -1237,17 +1249,19 @@ out: * @c: Execution context * @conn: Connection pointer * @seq: Current ACK sequence, host order + * @now: Current timestamp */ static void tcp_update_seqack_from_tap(const struct ctx *c, - struct tcp_tap_conn *conn, uint32_t seq) + struct tcp_tap_conn *conn, uint32_t seq, + const struct timespec *now) { if (seq == conn->seq_to_tap) - conn_flag(c, conn, ~ACK_FROM_TAP_DUE); + conn_flag(c, conn, ~ACK_FROM_TAP_DUE, now); if (SEQ_GT(seq, conn->seq_ack_from_tap)) { /* Forward progress, but more data to acknowledge: reschedule */ if (SEQ_LT(seq, conn->seq_to_tap)) - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); conn->retries = 0; conn->seq_ack_from_tap = seq; @@ -1258,16 +1272,18 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, * tcp_rewind_seq() - Rewind sequence to tap and socket offset to current ACK * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: 0 on success, -1 on failure, with connection reset */ -static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { conn->seq_to_tap = conn->seq_ack_from_tap; conn->events &= ~TAP_FIN_SENT; - if (tcp_set_peek_offset(conn, 0)) { - tcp_rst(c, conn); + if (tcp_set_peek_offset(conn, 0, now)) { + tcp_rst(c, conn, now); return -1; } @@ -1282,6 +1298,7 @@ static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) * @th: TCP header to update * @opts: TCP option buffer (output parameter) * @optlen: size of the TCP option buffer (output parameter) + * @now: Current timestamp * * Return: < 0 error code on connection reset, * 0 if there is no flag to send @@ -1289,7 +1306,7 @@ static int tcp_rewind_seq(const struct ctx *c, struct tcp_tap_conn *conn) */ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, - size_t *optlen) + size_t *optlen, const struct timespec *now) { struct tcp_info_linux tinfo = { 0 }; socklen_t sl = sizeof(tinfo); @@ -1297,19 +1314,19 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && !flags && conn->wnd_to_tap) { - conn_flag(c, conn, ~ACK_TO_TAP_DUE); + conn_flag(c, conn, ~ACK_TO_TAP_DUE, now); return 0; } if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { - conn_event(c, conn, CLOSED); + conn_event(c, conn, CLOSED, now); return -ECONNRESET; } if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); - if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo) && !flags) + if (!tcp_update_seqack_wnd(c, conn, !!flags, &tinfo, now) && !flags) return 0; *optlen = 0; @@ -1351,13 +1368,13 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && conn->wnd_to_tap) - conn_flag(c, conn, ~ACK_TO_TAP_DUE); + conn_flag(c, conn, ~ACK_TO_TAP_DUE, now); else - conn_flag(c, conn, ACK_TO_TAP_DUE); + conn_flag(c, conn, ACK_TO_TAP_DUE, now); } if (th->fin) - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */ if (th->fin || th->syn) @@ -1371,18 +1388,19 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @now: Current timestamp * * Return: negative error code on fatal connection failure, 0 otherwise */ static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, - int flags) + int flags, const struct timespec *now) { int ret; if (c->mode == MODE_VU) - ret = tcp_vu_send_flag(c, conn, flags); + ret = tcp_vu_send_flag(c, conn, flags, now); else - ret = tcp_buf_send_flag(c, conn, flags); + ret = tcp_buf_send_flag(c, conn, flags, now); return ret == -EAGAIN ? 0 : ret; } @@ -1413,28 +1431,32 @@ void tcp_linger0_(const struct flow_common *f, int s) * tcp_sock_rst() - Close TCP connection forcing RST on socket side * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp */ -static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_sock_rst(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { /* Force RST on socket to inform the peer */ tcp_linger0(conn, conn->sock); - conn_event(c, conn, CLOSED); + conn_event(c, conn, CLOSED, now); } /** * tcp_rst_do() - Reset a tap connection: send RST segment on both sides, close * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp */ -void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { if (conn->events == CLOSED) return; /* Send RST on tap */ - tcp_send_flag(c, conn, RST); + tcp_send_flag(c, conn, RST, now); - tcp_sock_rst(c, conn); + tcp_sock_rst(c, conn, now); } /** @@ -1459,11 +1481,13 @@ static void tcp_get_tap_ws(struct tcp_tap_conn *conn, * @c: Execution context * @conn: Connection pointer * @wnd: Window value, host order, unscaled + * @now: Current timestamp * * Return: false on zero window (not stored to wnd_from_tap), true otherwise */ static bool tcp_tap_window_update(const struct ctx *c, - struct tcp_tap_conn *conn, unsigned wnd) + struct tcp_tap_conn *conn, unsigned wnd, + const struct timespec *now) { wnd = MIN(MAX_WINDOW, wnd << conn->ws_from_tap); @@ -1474,7 +1498,7 @@ static bool tcp_tap_window_update(const struct ctx *c, * that no data beyond the updated window will be acknowledged. */ if (!wnd && SEQ_LT(conn->seq_ack_from_tap, conn->seq_to_tap)) { - tcp_rewind_seq(c, conn); + tcp_rewind_seq(c, conn, now); return false; } @@ -1600,9 +1624,11 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, * @c: Execution context * @conn: Connection entry for socket to bind * @s: Outbound TCP socket + * @now: Current timestamp */ static void tcp_bind_outbound(const struct ctx *c, - const struct tcp_tap_conn *conn, int s) + const struct tcp_tap_conn *conn, int s, + const struct timespec *now) { const struct flowside *tgt = &conn->f.side[TGTSIDE]; union sockaddr_inany bind_sa; @@ -1613,10 +1639,11 @@ static void tcp_bind_outbound(const struct ctx *c, if (bind(s, &bind_sa.sa, socklen_inany(&bind_sa))) { char sstr[INANY_ADDRSTRLEN]; - flow_dbg_perror(conn, - "Can't bind TCP outbound socket to %s:%hu", - inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), - tgt->oport); + flow_warn_perror_ratelimit( + conn, now, + "Can't bind TCP outbound socket to %s:%hu", + inany_ntop(&tgt->oaddr, sstr, sizeof(sstr)), + tgt->oport); } } @@ -1625,9 +1652,10 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip4.ifname_out, strlen(c->ip4.ifname_out))) { - flow_dbg_perror(conn, - "Can't bind IPv4 TCP socket to interface %s", - c->ip4.ifname_out); + flow_warn_perror_ratelimit( + conn, now, + "Can't bind IPv4 TCP socket to interface %s", + c->ip4.ifname_out); } } } else if (bind_sa.sa_family == AF_INET6) { @@ -1635,9 +1663,10 @@ static void tcp_bind_outbound(const struct ctx *c, if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE, c->ip6.ifname_out, strlen(c->ip6.ifname_out))) { - flow_dbg_perror(conn, - "Can't bind IPv6 TCP socket to interface %s", - c->ip6.ifname_out); + flow_warn_perror_ratelimit( + conn, now, + "Can't bind IPv6 TCP socket to interface %s", + c->ip6.ifname_out); } } } @@ -1681,9 +1710,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { - flow_err(flow, "No support for forwarding TCP from %s to %s", - pif_name(flow->f.pif[INISIDE]), - pif_name(flow->f.pif[TGTSIDE])); + flow_err_ratelimit( + flow, now, + "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); goto cancel; } @@ -1716,7 +1747,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, */ if (bind(s, &sa.sa, socklen_inany(&sa))) { if (errno != EADDRNOTAVAIL && errno != EACCES) - conn_flag(c, conn, LOCAL); + conn_flag(c, conn, LOCAL, now); } else { /* Not a local, bound destination, inconclusive test */ close(s); @@ -1728,10 +1759,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn->timer = -1; flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, TGTSIDE) < 0) { - flow_perror(flow, "Can't register with epoll"); + flow_perror_ratelimit(flow, now, "Can't register with epoll"); goto cancel; } - conn_event(c, conn, TAP_SYN_RCVD); + conn_event(c, conn, TAP_SYN_RCVD, now); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -1756,11 +1787,11 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, conn->seq_to_tap = tcp_init_seq(hash, now); conn->seq_ack_from_tap = conn->seq_to_tap; - tcp_bind_outbound(c, conn, s); + tcp_bind_outbound(c, conn, s, now); if (connect(s, &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); goto cancel; } @@ -1768,10 +1799,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, } else { tcp_get_sndbuf(conn); - if (tcp_send_flag(c, conn, SYN | ACK)) + if (tcp_send_flag(c, conn, SYN | ACK, now)) goto cancel; - conn_event(c, conn, TAP_SYN_ACK_SENT); + conn_event(c, conn, TAP_SYN_ACK_SENT, now); } tcp_epoll_ctl(conn); @@ -1830,17 +1861,19 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: negative on connection reset, 0 otherwise * * #syscalls recvmsg */ -static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { if (c->mode == MODE_VU) - return tcp_vu_data_from_sock(c, conn); + return tcp_vu_data_from_sock(c, conn, now); - return tcp_buf_data_from_sock(c, conn); + return tcp_buf_data_from_sock(c, conn, now); } /** @@ -1866,13 +1899,15 @@ static ssize_t tcp_packet_data_len(const struct tcphdr *th, size_t l4len) * @conn: Connection pointer * @p: Pool of TCP packets, with TCP headers * @idx: Index of first data packet in pool + * @now: Current timestamp * * #syscalls sendmsg * * Return: count of consumed packets */ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, - const struct pool *p, int idx) + const struct pool *p, int idx, + const struct timespec *now) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; @@ -1909,7 +1944,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, return -1; if (th->rst) { - tcp_sock_rst(c, conn); + tcp_sock_rst(c, conn, now); return 1; } @@ -1922,10 +1957,10 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, "keep-alive sequence: %u, previous: %u", seq, conn->seq_from_tap); - if (tcp_send_flag(c, conn, ACK)) + if (tcp_send_flag(c, conn, ACK, now)) return -1; - tcp_timer_ctl(c, conn); + tcp_timer_ctl(c, conn, now); if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE, &((int){ 1 }), sizeof(int))) @@ -1933,7 +1968,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, if (p->count == 1) { tcp_tap_window_update(c, conn, - ntohs(th->window)); + ntohs(th->window), now); return 1; } @@ -1959,7 +1994,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, * well. */ if (!ntohs(th->window)) - tcp_rewind_seq(c, conn); + tcp_rewind_seq(c, conn, now); max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; @@ -2022,19 +2057,19 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, /* On socket flush failure, pretend there was no ACK, try again later */ if (ack && !tcp_sock_consume(conn, max_ack_seq)) - tcp_update_seqack_from_tap(c, conn, max_ack_seq); + tcp_update_seqack_from_tap(c, conn, max_ack_seq, now); - tcp_tap_window_update(c, conn, max_ack_seq_wnd); + tcp_tap_window_update(c, conn, max_ack_seq_wnd, now); if (retr) { flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", conn->seq_ack_from_tap, conn->seq_to_tap); - if (tcp_rewind_seq(c, conn)) + if (tcp_rewind_seq(c, conn, now)) return -1; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, now); } if (!iov_i) @@ -2050,7 +2085,7 @@ eintr: * Then swiftly looked away and left. */ conn->seq_from_tap = seq_from_tap; - if (tcp_send_flag(c, conn, ACK)) + if (tcp_send_flag(c, conn, ACK, now)) return -1; } @@ -2058,7 +2093,7 @@ eintr: goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - if (tcp_send_flag(c, conn, ACK | DUP_ACK)) + if (tcp_send_flag(c, conn, ACK | DUP_ACK, now)) return -1; uint32_t events = tcp_conn_epoll_events(conn->events, @@ -2094,7 +2129,7 @@ out: */ if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) { conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff; - if (tcp_send_flag(c, conn, ACK | DUP_ACK)) + if (tcp_send_flag(c, conn, ACK | DUP_ACK, now)) return -1; } return p->count - idx; @@ -2102,14 +2137,14 @@ out: if (ack && conn->events & TAP_FIN_SENT && conn->seq_ack_from_tap == conn->seq_to_tap) - conn_event(c, conn, TAP_FIN_ACKED); + conn_event(c, conn, TAP_FIN_ACKED, now); if (fin && !partial_send) { conn->seq_from_tap++; - conn_event(c, conn, TAP_FIN_RCVD); + conn_event(c, conn, TAP_FIN_RCVD, now); } else { - if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) + if (tcp_send_flag(c, conn, ACK_IF_NEEDED, now)) return -1; } @@ -2123,13 +2158,15 @@ out: * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length + * @now: Current timestamp */ static void tcp_conn_from_sock_finish(const struct ctx *c, struct tcp_tap_conn *conn, const struct tcphdr *th, - const char *opts, size_t optlen) + const char *opts, size_t optlen, + const struct timespec *now) { - tcp_tap_window_update(c, conn, ntohs(th->window)); + tcp_tap_window_update(c, conn, ntohs(th->window), now); tcp_get_tap_ws(conn, opts, optlen); /* First value is not scaled */ @@ -2142,21 +2179,21 @@ static void tcp_conn_from_sock_finish(const struct ctx *c, conn->seq_from_tap = conn->seq_init_from_tap; conn->seq_ack_to_tap = conn->seq_from_tap; - conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn, 0)) { - tcp_rst(c, conn); + conn_event(c, conn, ESTABLISHED, now); + if (tcp_set_peek_offset(conn, 0, now)) { + tcp_rst(c, conn, now); return; } - if (tcp_send_flag(c, conn, ACK)) { - tcp_rst(c, conn); + if (tcp_send_flag(c, conn, ACK, now)) { + tcp_rst(c, conn, now); return; } /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, now); } /** @@ -2297,7 +2334,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, flow_trace(conn, "packet length %zu from tap", l4len); if (th->rst) { - tcp_sock_rst(c, conn); + tcp_sock_rst(c, conn, now); return 1; } @@ -2305,12 +2342,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, conn->tap_inactive = false; if (th->ack && !(conn->events & ESTABLISHED)) - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq), now); /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { if (th->syn && th->ack && !th->fin) { - tcp_conn_from_sock_finish(c, conn, th, opts, optlen); + tcp_conn_from_sock_finish(c, conn, th, opts, optlen, + now); return 1; } @@ -2325,22 +2363,23 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (!(conn->events & TAP_SYN_ACK_SENT)) goto reset; - conn_event(c, conn, ESTABLISHED); - if (tcp_set_peek_offset(conn, 0)) + conn_event(c, conn, ESTABLISHED, now); + if (tcp_set_peek_offset(conn, 0, now)) goto reset; if (th->fin) { conn->seq_from_tap++; if (shutdown(conn->sock, SHUT_WR) < 0) { - flow_dbg_perror(conn, "shutdown() failed"); + flow_warn_perror_ratelimit(conn, now, + "shutdown() failed"); goto reset; } - if (tcp_send_flag(c, conn, ACK)) + if (tcp_send_flag(c, conn, ACK, now)) goto reset; - conn_event(c, conn, SOCK_FIN_SENT); + conn_event(c, conn, SOCK_FIN_SENT, now); return 1; } @@ -2348,8 +2387,8 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, if (!th->ack) goto reset; - if (tcp_tap_window_update(c, conn, ntohs(th->window))) - tcp_data_from_sock(c, conn); + if (tcp_tap_window_update(c, conn, ntohs(th->window), now)) + tcp_data_from_sock(c, conn, now); if (p->count - idx == 1) return 1; @@ -2373,38 +2412,40 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, * later */ if (th->ack && !tcp_sock_consume(conn, ntohl(th->ack_seq))) - tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq)); + tcp_update_seqack_from_tap(c, conn, ntohl(th->ack_seq), + now); if (retr) { flow_trace(conn, "fast re-transmit, ACK: %u, previous sequence: %u", ntohl(th->ack_seq), conn->seq_to_tap); - if (tcp_rewind_seq(c, conn)) + if (tcp_rewind_seq(c, conn, now)) return -1; } - if (tcp_tap_window_update(c, conn, ntohs(th->window)) || retr) - tcp_data_from_sock(c, conn); + if (tcp_tap_window_update(c, conn, ntohs(th->window), now) || + retr) + tcp_data_from_sock(c, conn, now); if (conn->seq_ack_from_tap == conn->seq_to_tap) { if (th->ack && conn->events & TAP_FIN_SENT) - conn_event(c, conn, TAP_FIN_ACKED); + conn_event(c, conn, TAP_FIN_ACKED, now); if (conn->events & SOCK_FIN_RCVD && conn->events & TAP_FIN_ACKED) - conn_event(c, conn, CLOSED); + conn_event(c, conn, CLOSED, now); } return 1; } /* Established connections accepting data from tap */ - count = tcp_data_from_tap(c, conn, p, idx); + count = tcp_data_from_tap(c, conn, p, idx, now); if (count == -1) goto reset; - conn_flag(c, conn, ~STALLED); + conn_flag(c, conn, ~STALLED, now); if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2414,12 +2455,13 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, struct tcp_info tinfo; if (shutdown(conn->sock, SHUT_WR) < 0) { - flow_dbg_perror(conn, "shutdown() failed"); + flow_warn_perror_ratelimit(conn, now, + "shutdown() failed"); goto reset; } - conn_event(c, conn, SOCK_FIN_SENT); - if (tcp_send_flag(c, conn, ACK)) + conn_event(c, conn, SOCK_FIN_SENT, now); + if (tcp_send_flag(c, conn, ACK, now)) goto reset; ack_due = 0; @@ -2440,7 +2482,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, } if (ack_due) - conn_flag(c, conn, ACK_TO_TAP_DUE); + conn_flag(c, conn, ACK_TO_TAP_DUE, now); return count; @@ -2449,7 +2491,7 @@ reset: * remaining packets in the batch, since they'd be invalidated when our * RST is received, even if otherwise good. */ - tcp_rst(c, conn); + tcp_rst(c, conn, now); return p->count - idx; } @@ -2457,25 +2499,27 @@ reset: * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp */ -static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { socklen_t sl; int so; sl = sizeof(so); if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return; } - if (tcp_send_flag(c, conn, SYN | ACK)) { - tcp_rst(c, conn); + if (tcp_send_flag(c, conn, SYN | ACK, now)) { + tcp_rst(c, conn, now); return; } - conn_event(c, conn, TAP_SYN_ACK_SENT); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_event(c, conn, TAP_SYN_ACK_SENT, now); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); } /** @@ -2498,13 +2542,13 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT); if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, s, INISIDE) < 0) { - flow_perror(flow, "Can't register with epoll"); - conn_flag(c, conn, CLOSING); + flow_perror_ratelimit(flow, now, "Can't register with epoll"); + conn_flag(c, conn, CLOSING, now); FLOW_ACTIVATE(conn); return; } - conn_event(c, conn, SOCK_ACCEPTED); + conn_event(c, conn, SOCK_ACCEPTED, now); hash = flow_hash_insert(c, TAP_SIDX(conn)); conn->seq_to_tap = tcp_init_seq(hash, now); @@ -2513,13 +2557,13 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, conn->wnd_from_tap = WINDOW_DEFAULT; - if (tcp_send_flag(c, conn, SYN)) { - conn_flag(c, conn, CLOSING); + if (tcp_send_flag(c, conn, SYN, now)) { + conn_flag(c, conn, CLOSING, now); FLOW_ACTIVATE(conn); return; } - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); tcp_get_sndbuf(conn); @@ -2577,7 +2621,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, switch (flow->f.pif[TGTSIDE]) { case PIF_SPLICE: case PIF_HOST: - tcp_splice_conn_from_sock(c, flow, s); + tcp_splice_conn_from_sock(c, flow, s, now); break; case PIF_TAP: @@ -2585,9 +2629,11 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, break; default: - flow_err(flow, "No support for forwarding TCP from %s to %s", - pif_name(flow->f.pif[INISIDE]), - pif_name(flow->f.pif[TGTSIDE])); + flow_err_ratelimit( + flow, now, + "No support for forwarding TCP from %s to %s", + pif_name(flow->f.pif[INISIDE]), + pif_name(flow->f.pif[TGTSIDE])); goto rst; } @@ -2604,12 +2650,14 @@ cancel: * tcp_timer_handler() - timerfd events: close, send ACK, retransmit, or reset * @c: Execution context * @ref: epoll reference of timer (not connection) + * @now: Current timestamp * * #syscalls timerfd_gettime|timerfd_gettime64 * #syscalls arm:timerfd_gettime64 i686:timerfd_gettime64 * #syscalls arm:timerfd_settime64 i686:timerfd_settime64 */ -void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref, + const struct timespec *now) { struct itimerspec check_armed = { { 0 }, { 0 } }; struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; @@ -2622,17 +2670,17 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * and we just set the timer to a new point in the future: discard it. */ if (timerfd_gettime(conn->timer, &check_armed)) - flow_perror(conn, "failed to read timer"); + flow_perror_ratelimit(conn, now, "failed to read timer"); if (check_armed.it_value.tv_sec || check_armed.it_value.tv_nsec) return; if (conn->flags & ACK_TO_TAP_DUE) { - if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) { - tcp_rst(c, conn); + if (tcp_send_flag(c, conn, ACK_IF_NEEDED, now)) { + tcp_rst(c, conn, now); return; } - tcp_timer_ctl(c, conn); + tcp_timer_ctl(c, conn, now); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { unsigned int max; @@ -2641,20 +2689,20 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) max = MIN(TCP_MAX_RETRIES, max); if (conn->retries >= max) { flow_dbg(conn, "handshake timeout"); - tcp_rst(c, conn); + tcp_rst(c, conn, now); } else { flow_trace(conn, "SYN timeout, retry"); - if (tcp_send_flag(c, conn, SYN)) { - tcp_rst(c, conn); + if (tcp_send_flag(c, conn, SYN, now)) { + tcp_rst(c, conn, now); return; } conn->retries++; - conn_flag(c, conn, SYN_RETRIED); - tcp_timer_ctl(c, conn); + conn_flag(c, conn, SYN_RETRIED, now); + tcp_timer_ctl(c, conn, now); } } else if (conn->retries == TCP_MAX_RETRIES) { flow_dbg(conn, "retransmissions count exceeded"); - tcp_rst(c, conn); + tcp_rst(c, conn, now); } else { flow_dbg(conn, "ACK timeout, retry"); @@ -2662,11 +2710,11 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) conn->wnd_from_tap = 1; /* Zero-window probe */ conn->retries++; - if (tcp_rewind_seq(c, conn)) + if (tcp_rewind_seq(c, conn, now)) return; - tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + tcp_data_from_sock(c, conn, now); + tcp_timer_ctl(c, conn, now); } } } @@ -2676,9 +2724,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap + * @now: Current timestamp */ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events) + uint32_t events, const struct timespec *now) { struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); @@ -2689,32 +2738,32 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, return; if (events & EPOLLERR) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return; } conn->inactive = false; if ((conn->events & TAP_FIN_ACKED) && (events & EPOLLHUP)) { - conn_event(c, conn, CLOSED); + conn_event(c, conn, CLOSED, now); return; } if (conn->events & ESTABLISHED) { if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - conn_event(c, conn, CLOSED); + conn_event(c, conn, CLOSED, now); if (events & (EPOLLRDHUP | EPOLLHUP)) - conn_event(c, conn, SOCK_FIN_RCVD); + conn_event(c, conn, SOCK_FIN_RCVD, now); if (events & EPOLLIN) - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, now); if (events & EPOLLOUT) { tcp_epoll_ctl(conn); - if (tcp_update_seqack_wnd(c, conn, false, NULL) && - tcp_send_flag(c, conn, ACK)) { - tcp_rst(c, conn); + if (tcp_update_seqack_wnd(c, conn, false, NULL, now) && + tcp_send_flag(c, conn, ACK, now)) { + tcp_rst(c, conn, now); return; } } @@ -2724,7 +2773,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, /* EPOLLHUP during handshake: reset */ if (events & EPOLLHUP) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return; } @@ -2734,7 +2783,7 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, if (conn->events == TAP_SYN_RCVD) { if (events & EPOLLOUT) - tcp_connect_finish(c, conn); + tcp_connect_finish(c, conn, now); /* Data? Check later */ } } @@ -2956,8 +3005,8 @@ static void tcp_keepalive(struct ctx *c, const struct timespec *now) if (conn->tap_inactive) { flow_dbg(conn, "No tap activity for least %us, send keepalive", KEEPALIVE_INTERVAL); - if (tcp_send_flag(c, conn, KEEPALIVE)) - tcp_rst(c, conn); + if (tcp_send_flag(c, conn, KEEPALIVE, now)) + tcp_rst(c, conn, now); } /* Ready to check fot next interval */ @@ -2986,7 +3035,7 @@ static void tcp_inactivity(struct ctx *c, const struct timespec *now) /* No activity in this interval, reset */ flow_dbg(conn, "Inactive for at least %us, resetting", INACTIVITY_INTERVAL); - tcp_rst(c, conn); + tcp_rst(c, conn, now); } /* Ready to check fot next interval */ @@ -3002,7 +3051,7 @@ static void tcp_inactivity(struct ctx *c, const struct timespec *now) /* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c, const struct timespec *now) { - tcp_payload_flush(c); + tcp_payload_flush(c, now); if (timespec_diff_ms(now, &c->tcp.timer_run) < TCP_TIMER_INTERVAL) return; @@ -3549,11 +3598,13 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn) * @c: Execution context * @fd: Descriptor for state migration * @conn: Pointer to the TCP connection structure + * @now: Current timesstamp * * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure */ int tcp_flow_migrate_source_ext(const struct ctx *c, - int fd, const struct tcp_tap_conn *conn) + int fd, const struct tcp_tap_conn *conn, + const struct timespec *now) { uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)]; @@ -3563,7 +3614,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c, /* Disable SO_PEEK_OFF, it will make accessing the queues in repair mode * weird. */ - if (tcp_set_peek_offset(conn, -1)) { + if (tcp_set_peek_offset(conn, -1, now)) { rc = -errno; goto fail; } @@ -3828,10 +3879,12 @@ out: * @c: Execution context * @conn: Connection entry to complete with extra data * @fd: Descriptor for state migration + * @now: Current timestamp * * Return: 0 on success, negative on fatal failure, but 0 on single flow failure */ -int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd) +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, + int fd, const struct timespec *now) { uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; struct tcp_tap_transfer_ext t; @@ -3985,13 +4038,13 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd } } - if (tcp_set_peek_offset(conn, peek_offset)) + if (tcp_set_peek_offset(conn, peek_offset, now)) goto fail; - if (tcp_send_flag(c, conn, ACK)) + if (tcp_send_flag(c, conn, ACK, now)) goto fail; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, now); if ((rc = tcp_epoll_ctl(conn))) { flow_dbg(conn, @@ -4009,7 +4062,7 @@ fail: } conn->flags = 0; /* Not waiting for ACK, don't schedule timer */ - tcp_rst(c, conn); + tcp_rst(c, conn, now); return 0; } @@ -18,11 +18,12 @@ struct ctx; -void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref, + const struct timespec *now); void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, const struct timespec *now); void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, - uint32_t events); + uint32_t events, const struct timespec *now); int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, uint32_t flow_lbl, const struct pool *p, int idx, const struct timespec *now); @@ -106,9 +106,11 @@ void tcp_sock_iov_init(const struct ctx *c) * @conns: Array of connection pointers corresponding to queued frames * @frames: Two-dimensional array containing queued frames with sub-iovs * @num_frames: Number of entries in the two arrays to be compared + * @now: Current timestamp */ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, - struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) + struct iovec (*frames)[TCP_NUM_IOVS], int num_frames, + const struct timespec *now) { int i; @@ -123,16 +125,17 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, conn->seq_to_tap = seq; peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; - if (tcp_set_peek_offset(conn, peek_offset)) - tcp_rst(c, conn); + if (tcp_set_peek_offset(conn, peek_offset, now)) + tcp_rst(c, conn, now); } } /** * tcp_payload_flush() - Send out buffers for segments with data or flags * @c: Execution context + * @now: Current timestamp */ -void tcp_payload_flush(const struct ctx *c) +void tcp_payload_flush(const struct ctx *c, const struct timespec *now) { size_t m; @@ -140,7 +143,7 @@ void tcp_payload_flush(const struct ctx *c) tcp_payload_used); if (m != tcp_payload_used) { tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m], - tcp_payload_used - m); + tcp_payload_used - m, now); } tcp_payload_used = 0; } @@ -198,13 +201,15 @@ static void tcp_l2_buf_fill_headers(const struct ctx *c, /** * tcp_buf_send_flag() - Send segment with flags to tap (no payload) - * @c: Execution context - * @conn: Connection pointer - * @flags: TCP flags: if not set, send segment only if ACK is due + * @c: Execution context + * @conn: Connection pointer + * @flags: TCP flags: if not set, send segment only if ACK is due + * @now: Current timestamp * * Return: negative error code on connection reset, 0 otherwise */ -int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags, + const struct timespec *now) { struct tcp_payload_t *payload; struct iovec *iov; @@ -223,7 +228,8 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) payload = iov[TCP_IOV_PAYLOAD].iov_base; seq = conn->seq_to_tap; ret = tcp_prepare_flags(c, conn, flags, &payload->th, - (struct tcp_syn_opts *)&payload->data, &optlen); + (struct tcp_syn_opts *)&payload->data, + &optlen, now); if (ret <= 0) return ret; @@ -253,7 +259,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) } if (tcp_payload_used > TCP_FRAMES_MEM - 2) - tcp_payload_flush(c); + tcp_payload_flush(c, now); return 0; } @@ -266,9 +272,11 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent * @push: Set PSH flag, last segment in a batch + * @now: Current timestamp */ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, - ssize_t dlen, int no_csum, uint32_t seq, bool push) + ssize_t dlen, int no_csum, uint32_t seq, bool push, + const struct timespec *now) { struct tcp_payload_t *payload; uint32_t check = IP4_CSUM; @@ -302,19 +310,21 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, tcp_l2_buf_pad(iov); if (++tcp_payload_used > TCP_FRAMES_MEM - 1) - tcp_payload_flush(c); + tcp_payload_flush(c, now); } /** * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: negative on connection reset, 0 otherwise * * #syscalls recvmsg */ -int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; @@ -333,16 +343,16 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if (tcp_set_peek_offset(conn, 0)) { - tcp_rst(c, conn); + if (tcp_set_peek_offset(conn, 0, now)) { + tcp_rst(c, conn, now); return -1; } } if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS, now); + conn_flag(c, conn, STALLED, now); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); return 0; } @@ -356,12 +366,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return -1; } if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) { - tcp_payload_flush(c); + tcp_payload_flush(c, now); /* Silence Coverity CWE-125 false positive */ tcp_payload_used = 0; @@ -381,19 +391,19 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) if (len < 0) { if (errno != EAGAIN && errno != EWOULDBLOCK) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return -errno; } if (already_sent) /* No new data and EAGAIN: set EPOLLET */ - conn_flag(c, conn, STALLED); + conn_flag(c, conn, STALLED, now); return 0; } if (!len) { if (already_sent) { - conn_flag(c, conn, STALLED); + conn_flag(c, conn, STALLED, now); } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { int ret; @@ -408,14 +418,14 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) */ conn->seq_ack_to_tap = conn->seq_from_tap; - ret = tcp_buf_send_flag(c, conn, FIN | ACK); + ret = tcp_buf_send_flag(c, conn, FIN | ACK, now); if (ret) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return ret; } - conn_event(c, conn, TAP_FIN_SENT); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_event(c, conn, TAP_FIN_SENT, now); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); } return 0; @@ -425,18 +435,18 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) len -= already_sent; if (len <= 0) { - conn_flag(c, conn, STALLED); + conn_flag(c, conn, STALLED, now); return 0; } - conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); - conn_flag(c, conn, ~STALLED); + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS, now); + conn_flag(c, conn, ~STALLED, now); send_bufs = DIV_ROUND_UP(len, mss); last_len = len - (send_bufs - 1) * mss; /* Likely, some new data was acked too. */ - tcp_update_seqack_wnd(c, conn, false, NULL); + tcp_update_seqack_wnd(c, conn, false, NULL, now); /* Finally, queue to tap */ dlen = mss; @@ -450,11 +460,11 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) push = true; } - tcp_data_to_tap(c, conn, dlen, no_csum, seq, push); + tcp_data_to_tap(c, conn, dlen, no_csum, seq, push, now); seq += dlen; } - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); return 0; } @@ -7,8 +7,10 @@ #define TCP_BUF_H void tcp_sock_iov_init(const struct ctx *c); -void tcp_payload_flush(const struct ctx *c); -int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); -int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); +void tcp_payload_flush(const struct ctx *c, const struct timespec *now); +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now); +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags, + const struct timespec *now); #endif /*TCP_BUF_H */ @@ -248,11 +248,13 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn); int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn); -int tcp_flow_migrate_source_ext(const struct ctx *c, int fd, - const struct tcp_tap_conn *conn); +int tcp_flow_migrate_source_ext(const struct ctx *c, + int fd, const struct tcp_tap_conn *conn, + const struct timespec *now); int tcp_flow_migrate_target(struct ctx *c, int fd); -int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd); +int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, + int fd, const struct timespec *now); bool tcp_flow_is_established(const struct tcp_tap_conn *conn); diff --git a/tcp_internal.h b/tcp_internal.h index 40472c9..169b3d3 100644 --- a/tcp_internal.h +++ b/tcp_internal.h @@ -158,27 +158,28 @@ struct tcp_syn_opts { extern char tcp_buf_discard [BUF_DISCARD_SIZE]; void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long flag); -#define conn_flag(c, conn, flag) \ + unsigned long flag, const struct timespec *now); +#define conn_flag(c, conn, flag, now) \ do { \ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ + conn_flag_do(c, conn, flag, now); \ } while (0) void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, - unsigned long event); -#define conn_event(c, conn, event) \ + unsigned long event, const struct timespec *now); +#define conn_event(c, conn, event, now) \ do { \ flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ - conn_event_do(c, conn, event); \ + conn_event_do(c, conn, event, now); \ } while (0) -void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); -#define tcp_rst(c, conn) \ +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now); +#define tcp_rst(c, conn, now) \ do { \ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ - tcp_rst_do(c, conn); \ + tcp_rst_do(c, conn, now); \ } while (0) struct tcp_info_linux; @@ -194,11 +195,13 @@ size_t tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn, size_t dlen, uint32_t csum_flags, uint32_t seq); int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, - bool force_seq, struct tcp_info_linux *tinfo); + bool force_seq, struct tcp_info_linux *tinfo, + const struct timespec *now); int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, struct tcp_syn_opts *opts, - size_t *optlen); -int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset); + size_t *optlen, const struct timespec *now); +int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset, + const struct timespec *now); int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov, uint32_t already_sent, int payload_iov_cnt); diff --git a/tcp_splice.c b/tcp_splice.c index 3fd33a1..4b01f1a 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -136,10 +136,12 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) /** * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events * @conn: Connection pointer + * @now: Current timestamp * * Return: 0 on success, negative error code on failure (not on deletion) */ -static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn) +static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn, + const struct timespec *now) { uint32_t events[2]; @@ -149,7 +151,7 @@ static int tcp_splice_epoll_ctl(struct tcp_splice_conn *conn) if (flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events[0], conn->s[0], 0) || flow_epoll_set(&conn->f, EPOLL_CTL_MOD, events[1], conn->s[1], 1)) { int ret = -errno; - flow_perror(conn, "ERROR on epoll_ctl()"); + flow_perror_ratelimit(conn, now, "ERROR on epoll_ctl()"); return ret; } @@ -201,8 +203,10 @@ static void conn_flag_do(struct tcp_splice_conn *conn, * conn_event_do() - Set and log connection events, update epoll state * @conn: Connection pointer * @event: Connection event + * @now: Current timestamp */ -static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event) +static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event, + const struct timespec *now) { if (event & (event - 1)) { int flag_index = fls(~event); @@ -224,14 +228,14 @@ static void conn_event_do(struct tcp_splice_conn *conn, unsigned long event) flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]); } - if (tcp_splice_epoll_ctl(conn)) + if (tcp_splice_epoll_ctl(conn, now)) conn_flag(conn, CLOSING); } -#define conn_event(conn, event) \ +#define conn_event(conn, event, now) \ do { \ flow_trace(conn, "event at %s:%i",__func__, __LINE__); \ - conn_event_do(conn, event); \ + conn_event_do(conn, event, now); \ } while (0) /** @@ -292,11 +296,13 @@ bool tcp_splice_flow_defer(struct tcp_splice_conn *conn) * tcp_splice_connect_finish() - Completion of connect() or call on success * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: 0 on success, -EIO on failure */ static int tcp_splice_connect_finish(const struct ctx *c, - struct tcp_splice_conn *conn) + struct tcp_splice_conn *conn, + const struct timespec *now) { unsigned sidei; int i = 0; @@ -314,8 +320,10 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[sidei][0] < 0) { if (pipe2(conn->pipe[sidei], O_NONBLOCK | O_CLOEXEC)) { - flow_perror(conn, "cannot create %d->%d pipe", - sidei, !sidei); + flow_perror_ratelimit( + conn, now, + "cannot create %d->%d pipe", + sidei, !sidei); tcp_splice_rst(conn); return -EIO; } @@ -330,7 +338,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, } if (!(conn->events & SPLICE_ESTABLISHED)) - conn_event(conn, SPLICE_ESTABLISHED); + conn_event(conn, SPLICE_ESTABLISHED, now); return 0; } @@ -339,10 +347,12 @@ static int tcp_splice_connect_finish(const struct ctx *c, * tcp_splice_connect() - Create and connect socket for new spliced connection * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: 0 for connect() succeeded or in progress, negative value on error */ -static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) +static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, + const struct timespec *now) { const struct flowside *tgt = &conn->f.side[TGTSIDE]; sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6; @@ -381,11 +391,11 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) if (flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[0], 0) || flow_epoll_set(&conn->f, EPOLL_CTL_ADD, 0, conn->s[1], 1)) { int ret = -errno; - flow_perror(conn, "Cannot register to epollfd"); + flow_perror_ratelimit(conn, now, "Cannot register to epollfd"); return ret; } - conn_event(conn, SPLICE_CONNECT); + conn_event(conn, SPLICE_CONNECT, now); if (connect(conn->s[1], &sa.sa, socklen_inany(&sa))) { if (errno != EINPROGRESS) { @@ -394,8 +404,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) return -errno; } } else { - conn_event(conn, SPLICE_ESTABLISHED); - return tcp_splice_connect_finish(c, conn); + conn_event(conn, SPLICE_ESTABLISHED, now); + return tcp_splice_connect_finish(c, conn, now); } return 0; @@ -435,10 +445,12 @@ static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) * @flow: flow to initialise * @s0: Accepted (side 0) socket * @sa: Peer address of connection + * @now: Current timestamp * * #syscalls:pasta setsockopt */ -void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0, + const struct timespec *now) { struct tcp_splice_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice); @@ -453,7 +465,7 @@ void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0) if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); - if (tcp_splice_connect(c, conn)) + if (tcp_splice_connect(c, conn, now)) tcp_splice_rst(conn); FLOW_ACTIVATE(conn); @@ -499,7 +511,7 @@ static int tcp_splice_forward(struct ctx *c, if (readlen <= 0) { if (!readlen) /* EOF */ - conn_event(conn, FIN_RCVD(fromsidei)); + conn_event(conn, FIN_RCVD(fromsidei), now); /* We're either blocked or at EOF on the read side, and * there's nothing in the pipe so there's nothing to do @@ -551,9 +563,9 @@ static int tcp_splice_forward(struct ctx *c, * drain. */ if (conn->pending[fromsidei]) - conn_event(conn, OUT_WAIT(!fromsidei)); + conn_event(conn, OUT_WAIT(!fromsidei), now); else - conn_event(conn, ~OUT_WAIT(!fromsidei)); + conn_event(conn, ~OUT_WAIT(!fromsidei), now); if ((conn->events & FIN_RCVD(fromsidei)) && !(conn->events & FIN_SENT(!fromsidei)) && @@ -563,7 +575,7 @@ static int tcp_splice_forward(struct ctx *c, pif_name(conn->f.pif[!fromsidei])); return -1; } - conn_event(conn, FIN_SENT(!fromsidei)); + conn_event(conn, FIN_SENT(!fromsidei), now); } return 0; @@ -593,7 +605,8 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); if (rc) - flow_perror(conn, "Error retrieving SO_ERROR"); + flow_perror_ratelimit(conn, now, + "Error retrieving SO_ERROR"); else flow_dbg_ratelimit(conn, now, "Error event on %s socket: %s", @@ -610,7 +623,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, events); goto reset; } - if (tcp_splice_connect_finish(c, conn)) + if (tcp_splice_connect_finish(c, conn, now)) goto reset; } diff --git a/tcp_splice.h b/tcp_splice.h index 8a1a1f6..c24e419 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -13,7 +13,8 @@ union sockaddr_inany; void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0); +void tcp_splice_conn_from_sock(const struct ctx *c, union flow *flow, int s0, + const struct timespec *now); void tcp_splice_init(struct ctx *c); #endif /* TCP_SPLICE_H */ @@ -116,12 +116,14 @@ static int tcp_vu_send_dup(const struct ctx *c, struct vu_virtq *vq, * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due + * @now: Current timestamp * * Return: -ECONNRESET on fatal connection error, * -EAGAIN if vhost-user buffers are unavailable, * 0 otherwise */ -int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags, + const struct timespec *now) { struct vu_dev *vdev = c->vdev; struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; @@ -158,7 +160,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); seq = conn->seq_to_tap; - ret = tcp_prepare_flags(c, conn, flags, &th, &opts, &optlen); + ret = tcp_prepare_flags(c, conn, flags, &th, &opts, &optlen, now); if (ret <= 0) { vu_queue_rewind(vq, elem_cnt); return ret; @@ -423,10 +425,12 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn, * in window * @c: Execution context * @conn: Connection pointer + * @now: Current timestamp * * Return: negative on connection reset, 0 otherwise */ -int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) +int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now) { uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; struct vu_dev *vdev = c->vdev; @@ -450,16 +454,16 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; - if (tcp_set_peek_offset(conn, 0)) { - tcp_rst(c, conn); + if (tcp_set_peek_offset(conn, 0, now)) { + tcp_rst(c, conn, now); return -1; } } if (!wnd_scaled || already_sent >= wnd_scaled) { - conn_flag(c, conn, ACK_FROM_TAP_BLOCKS); - conn_flag(c, conn, STALLED); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_BLOCKS, now); + conn_flag(c, conn, STALLED, now); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); return 0; } @@ -474,12 +478,12 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) &elem_cnt, &frame_cnt); if (len < 0) { if (len != -EAGAIN && len != -EWOULDBLOCK) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return len; } if (already_sent) /* No new data and EAGAIN: set EPOLLET */ - conn_flag(c, conn, STALLED); + conn_flag(c, conn, STALLED, now); return 0; } @@ -487,7 +491,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) if (!len) { vu_queue_rewind(vq, elem_cnt); if (already_sent) { - conn_flag(c, conn, STALLED); + conn_flag(c, conn, STALLED, now); } else if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { int ret; @@ -495,24 +499,24 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) /* See tcp_buf_data_from_sock() */ conn->seq_ack_to_tap = conn->seq_from_tap; - ret = tcp_vu_send_flag(c, conn, FIN | ACK); + ret = tcp_vu_send_flag(c, conn, FIN | ACK, now); if (ret) { - tcp_rst(c, conn); + tcp_rst(c, conn, now); return ret; } - conn_event(c, conn, TAP_FIN_SENT); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_event(c, conn, TAP_FIN_SENT, now); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); } return 0; } - conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS); - conn_flag(c, conn, ~STALLED); + conn_flag(c, conn, ~ACK_FROM_TAP_BLOCKS, now); + conn_flag(c, conn, ~STALLED, now); /* Likely, some new data was acked too. */ - tcp_update_seqack_wnd(c, conn, false, NULL); + tcp_update_seqack_wnd(c, conn, false, NULL, now); /* initialize headers */ /* iov_vu is an array of buffers and the buffer size can be @@ -556,7 +560,7 @@ int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) } vu_queue_notify(vdev, vq); - conn_flag(c, conn, ACK_FROM_TAP_DUE); + conn_flag(c, conn, ACK_FROM_TAP_DUE, now); return 0; } @@ -6,7 +6,9 @@ #ifndef TCP_VU_H #define TCP_VU_H -int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); -int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); +int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags, + const struct timespec *now); +int tcp_vu_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn, + const struct timespec *now); #endif /*TCP_VU_H */ @@ -539,6 +539,7 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) * @pif: Interface on which the error occurred * (only used if @sidx == FLOW_SIDX_NONE) * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) + * @now: Current timestamp * * Return: 1 if error received and processed, 0 if no more errors in queue, < 0 * if there was an error reading the queue @@ -546,7 +547,8 @@ static int udp_pktinfo(struct msghdr *msg, union inany_addr *dst) * #syscalls recvmsg */ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, - uint8_t pif, in_port_t port) + uint8_t pif, in_port_t port, + const struct timespec *now) { char buf[PKTINFO_SPACE + RECVERR_SPACE]; const struct sock_extended_err *ee; @@ -664,12 +666,14 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx, } fail: - flow_dbg(uflow, "Can't propagate %s error from %s %s to %s %s", - str_ee_origin(ee), - pif_name(pif), - sockaddr_ntop(SO_EE_OFFENDER(ee), sastr, sizeof(sastr)), - pif_name(topif), - inany_ntop(&toside->eaddr, astr, sizeof(astr))); + flow_warn_ratelimit(uflow, now, + "Can't propagate %s error from %s %s to %s %s", + str_ee_origin(ee), + pif_name(pif), + sockaddr_ntop(SO_EE_OFFENDER(ee), + sastr, sizeof(sastr)), + pif_name(topif), + inany_ntop(&toside->eaddr, astr, sizeof(astr))); return 1; } @@ -681,11 +685,13 @@ fail: * @pif: Interface on which the error occurred * (only used if @sidx == FLOW_SIDX_NONE) * @port: Local port number of @s (only used if @sidx == FLOW_SIDX_NONE) + * @now: Current timestamp * * Return: number of errors handled, or < 0 if we have an unrecoverable error */ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, - uint8_t pif, in_port_t port) + uint8_t pif, in_port_t port, + const struct timespec *now) { unsigned n_err = 0; socklen_t errlen; @@ -694,7 +700,7 @@ static int udp_sock_errs(const struct ctx *c, int s, flow_sidx_t sidx, assert(!c->no_udp); /* Empty the error queue */ - while ((rc = udp_sock_recverr(c, s, sidx, pif, port)) > 0) + while ((rc = udp_sock_recverr(c, s, sidx, pif, port, now)) > 0) n_err += rc; if (rc < 0) @@ -869,7 +875,7 @@ void udp_sock_fwd(const struct ctx *c, int s, int rule_hint, strerror_(-rc)); /* Clear errors & carry on */ if (udp_sock_errs(c, s, FLOW_SIDX_NONE, - frompif, port) < 0) { + frompif, port, now) < 0) { err_ratelimit(now, "UDP: Unrecoverable error on listening socket: (%s port %hu)", pif_name(frompif), port); @@ -892,9 +898,10 @@ void udp_sock_fwd(const struct ctx *c, int s, int rule_hint, } else if (flow_sidx_valid(tosidx)) { struct udp_flow *uflow = udp_at_sidx(tosidx); - flow_err(uflow, - "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); + flow_err_ratelimit( + uflow, now, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); discard = true; } else { warn_ratelimit(now, "Discarding datagram without flow"); @@ -942,8 +949,11 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, assert(!c->no_udp && uflow); if (events & EPOLLERR) { - if (udp_sock_errs(c, ref.fd, ref.flowside, PIF_NONE, 0) < 0) { - flow_err(uflow, "Unrecoverable error on flow socket"); + if (udp_sock_errs(c, ref.fd, ref.flowside, + PIF_NONE, 0, now) < 0) { + flow_err_ratelimit( + uflow, now, + "Unrecoverable error on flow socket"); goto fail; } } @@ -974,7 +984,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, udp_buf_sock_to_tap(c, s, n, tosidx); } } else { - flow_err(uflow, + flow_err_ratelimit(uflow, now, "No support for forwarding UDP from %s to %s", pif_name(pif_at_sidx(ref.flowside)), pif_name(topif)); @@ -1052,8 +1062,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, flow_sidx_t fromsidx = flow_sidx_opposite(tosidx); uint8_t frompif = pif_at_sidx(fromsidx); - flow_err(uflow, "No support for forwarding UDP from %s to %s", - pif_name(frompif), pif_name(topif)); + flow_err_ratelimit(uflow, now, + "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); return 1; } toside = flowside_at_sidx(tosidx); @@ -64,11 +64,13 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow) * @c: Execution context * @uflow: UDP flow to open socket for * @sidei: Side of @uflow to open socket for + * @now: Current timestamp * * Return: fd of new socket on success, -ve error code on failure */ static int udp_flow_sock(const struct ctx *c, - struct udp_flow *uflow, unsigned sidei) + struct udp_flow *uflow, unsigned sidei, + const struct timespec *now) { const struct flowside *side = &uflow->f.side[sidei]; uint8_t pif = uflow->f.pif[sidei]; @@ -77,7 +79,8 @@ static int udp_flow_sock(const struct ctx *c, s = flowside_sock_l4(c, EPOLL_TYPE_UDP, pif, side); if (s < 0) { - flow_dbg_perror(uflow, "Couldn't open flow specific socket"); + flow_perror_ratelimit(uflow, now, + "Couldn't open flow specific socket"); return s; } @@ -92,8 +95,9 @@ static int udp_flow_sock(const struct ctx *c, epoll_del(flow_epollfd(&uflow->f), s); close(s); - flow_dbg(uflow, "Couldn't connect flow socket: %s", - strerror_(-rc)); + flow_warn_ratelimit(uflow, now, + "Couldn't connect flow socket: %s", + strerror_(-rc)); return rc; } uflow->s[sidei] = s; @@ -154,7 +158,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, flow_foreach_sidei(sidei) { if (pif_is_socket(uflow->f.pif[sidei])) - if (udp_flow_sock(c, uflow, sidei) < 0) + if (udp_flow_sock(c, uflow, sidei, now) < 0) goto cancel; } @@ -176,7 +180,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, goto cancel; } if (port != tgt->oport) { - flow_err(uflow, "Unexpected local port"); + flow_err_ratelimit(uflow, now, "Unexpected local port"); goto cancel; } } @@ -248,7 +252,8 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif, * been initiated from a socket bound to 0.0.0.0 or ::, we don't * know our address, so we have to leave it unpopulated. */ - flow_err(flow, "Invalid endpoint on UDP recvfrom()"); + flow_err_ratelimit(flow, now, + "Invalid endpoint on UDP recvfrom()"); flow_alloc_cancel(flow); return FLOW_SIDX_NONE; } |
