diff options
-rw-r--r-- | Makefile | 12 | ||||
-rw-r--r-- | epoll_type.h | 2 | ||||
-rw-r--r-- | migrate.c | 214 | ||||
-rw-r--r-- | migrate.h | 51 | ||||
-rw-r--r-- | passt.c | 8 | ||||
-rw-r--r-- | passt.h | 8 | ||||
-rw-r--r-- | util.h | 29 | ||||
-rw-r--r-- | vhost_user.c | 60 | ||||
-rw-r--r-- | virtio.h | 4 | ||||
-rw-r--r-- | vu_common.c | 49 | ||||
-rw-r--r-- | vu_common.h | 2 |
11 files changed, 324 insertions, 115 deletions
@@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ + tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c @@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ - virtio.h vu_common.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ + tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ + vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/epoll_type.h b/epoll_type.h index fd9eac3..f3ef415 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -40,8 +40,6 @@ enum epoll_type { EPOLL_TYPE_VHOST_CMD, /* vhost-user kick event socket */ EPOLL_TYPE_VHOST_KICK, - /* vhost-user migration socket */ - EPOLL_TYPE_VHOST_MIGRATION, EPOLL_NUM_TYPES, }; diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..aeac872 --- /dev/null +++ b/migrate.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" + +/* Magic identifier for migration data */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 + +/* Stages for version 1 */ +static const struct migrate_stage stages_v1[] = { + { 0 }, +}; + +/* Supported encoding versions, from latest (most preferred) to oldest */ +static const struct migrate_version versions[] = { + { 1, stages_v1, }, + { 0 }, +}; + +/* Current encoding version */ +#define CURRENT_VERSION (&versions[0]) + +/** + * migrate_source() - Migration as source, send state to hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_source(struct ctx *c, int fd) +{ + const struct migrate_version *v = CURRENT_VERSION; + const struct migrate_header header = { + .magic = htonll_constant(MIGRATE_MAGIC), + .version = htonl(v->id), + .compat_version = htonl(v->id), + }; + const struct migrate_stage *s; + int ret; + + if (write_all_buf(fd, &header, sizeof(header))) { + ret = errno; + err("Can't send migration header: %s, abort", strerror_(ret)); + return ret; + } + + for (s = v->s; s->name; s++) { + if (!s->source) + continue; + + debug("Source side migration stage: %s", s->name); + + if ((ret = s->source(c, s, fd))) { + err("Source migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_target_read_header() - Read header in target + * @fd: Descriptor for state transfer + * + * Return: version structure on success, NULL on failure with errno set + */ +static const struct migrate_version *migrate_target_read_header(int fd) +{ + const struct migrate_version *v; + struct migrate_header h; + uint32_t id, compat_id; + + if (read_all_buf(fd, &h, sizeof(h))) + return NULL; + + id = ntohl(h.version); + compat_id = ntohl(h.compat_version); + + debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u", + ntohll(h.magic), id, compat_id); + + if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) { + err("Invalid incoming device state"); + errno = EINVAL; + return NULL; + } + + for (v = versions; v->id; v++) + if (v->id <= id && v->id >= compat_id) + return v; + + errno = ENOTSUP; + err("Unsupported device state version: %u", id); + return NULL; +} + +/** + * migrate_target() - Migration as target, receive state from hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int migrate_target(struct ctx *c, int fd) +{ + const struct migrate_version *v; + const struct migrate_stage *s; + int ret; + + if (!(v = migrate_target_read_header(fd))) + return errno; + + for (s = v->s; s->name; s++) { + if (!s->target) + continue; + + debug("Target side migration stage: %s", s->name); + + if ((ret = s->target(c, s, fd))) { + err("Target migration stage: %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_init() - Set up things necessary for migration + * @c: Execution context + */ +void migrate_init(struct ctx *c) +{ + c->device_state_result = -1; +} + +/** + * migrate_close() - Close migration channel + * @c: Execution context + */ +void migrate_close(struct ctx *c) +{ + if (c->device_state_fd != -1) { + debug("Closing migration channel, fd: %d", c->device_state_fd); + close(c->device_state_fd); + c->device_state_fd = -1; + c->device_state_result = -1; + } +} + +/** + * migrate_request() - Request a migration of device state + * @c: Execution context + * @fd: fd to transfer state + * @target: Are we the target of the migration? + */ +void migrate_request(struct ctx *c, int fd, bool target) +{ + debug("Migration requested, fd: %d (was %d)", fd, c->device_state_fd); + + if (c->device_state_fd != -1) + migrate_close(c); + + c->device_state_fd = fd; + c->migrate_target = target; +} + +/** + * migrate_handler() - Send/receive passt internal state to/from hypervisor + * @c: Execution context + */ +void migrate_handler(struct ctx *c) +{ + int rc; + + if (c->device_state_fd < 0) + return; + + debug("Handling migration request from fd: %d, target: %d", + c->device_state_fd, c->migrate_target); + + if (c->migrate_target) + rc = migrate_target(c, c->device_state_fd); + else + rc = migrate_source(c, c->device_state_fd); + + migrate_close(c); + + c->device_state_result = rc; +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..2c51cd9 --- /dev/null +++ b/migrate.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, network order + * @version: Highest known, target aborts if too old, network order + * @compat_version: Lowest version compatible with @version, target aborts + * if too new, network order + */ +struct migrate_header { + uint64_t magic; + uint32_t version; + uint32_t compat_version; +} __attribute__((packed)); + +/** + * struct migrate_stage - Callbacks and parameters for one stage of migration + * @name: Stage name (for debugging) + * @source: Callback to implement this stage on the source + * @target: Callback to implement this stage on the target + */ +struct migrate_stage { + const char *name; + int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd); + int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd); + + /* Add here separate rollback callbacks if needed */ +}; + +/** + * struct migrate_version - Stages for a particular protocol version + * @id: Version number, host order + * @s: Ordered array of stages, NULL-terminated + */ +struct migrate_version { + uint32_t id; + const struct migrate_stage *s; +}; + +void migrate_init(struct ctx *c); +void migrate_close(struct ctx *c); +void migrate_request(struct ctx *c, int fd, bool target); +void migrate_handler(struct ctx *c); + +#endif /* MIGRATE_H */ @@ -51,6 +51,7 @@ #include "tcp_splice.h" #include "ndp.h" #include "vu_common.h" +#include "migrate.h" #define EPOLL_EVENTS 8 @@ -75,7 +76,6 @@ char *epoll_type_str[] = { [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", - [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -202,6 +202,7 @@ int main(int argc, char **argv) isolate_initial(argc, argv); c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1; + c.device_state_fd = -1; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; @@ -357,9 +358,6 @@ loop: case EPOLL_TYPE_VHOST_KICK: vu_kick_cb(c.vdev, ref, &now); break; - case EPOLL_TYPE_VHOST_MIGRATION: - vu_migrate(c.vdev, eventmask); - break; default: /* Can't happen */ ASSERT(0); @@ -368,5 +366,7 @@ loop: post_handler(&c, &now); + migrate_handler(&c); + goto loop; } @@ -237,6 +237,9 @@ struct ip6_ctx { * @low_wmem: Low probed net.core.wmem_max * @low_rmem: Low probed net.core.rmem_max * @vdev: vhost-user device + * @device_state_fd: Device state migration channel + * @device_state_result: Device state migration result + * @migrate_target: Are we the target, on the next migration request? */ struct ctx { enum passt_modes mode; @@ -305,6 +308,11 @@ struct ctx { int low_rmem; struct vu_dev *vdev; + + /* Migration */ + int device_state_fd; + int device_state_result; + bool migrate_target; }; void proto_update_l2_buf(const unsigned char *eth_d, @@ -125,14 +125,43 @@ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #endif +#ifndef __bswap_constant_32 +#define __bswap_constant_32(x) \ + ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) +#endif + +#ifndef __bswap_constant_64 +#define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ULL) >> 56) | \ + (((x) & 0x00ff000000000000ULL) >> 40) | \ + (((x) & 0x0000ff0000000000ULL) >> 24) | \ + (((x) & 0x000000ff00000000ULL) >> 8) | \ + (((x) & 0x00000000ff000000ULL) << 8) | \ + (((x) & 0x0000000000ff0000ULL) << 24) | \ + (((x) & 0x000000000000ff00ULL) << 40) | \ + (((x) & 0x00000000000000ffULL) << 56)) +#endif + #if __BYTE_ORDER == __BIG_ENDIAN #define htons_constant(x) (x) #define htonl_constant(x) (x) +#define htonll_constant(x) (x) +#define ntohs_constant(x) (x) +#define ntohl_constant(x) (x) +#define ntohll_constant(x) (x) #else #define htons_constant(x) (__bswap_constant_16(x)) #define htonl_constant(x) (__bswap_constant_32(x)) +#define htonll_constant(x) (__bswap_constant_64(x)) +#define ntohs_constant(x) (__bswap_constant_16(x)) +#define ntohl_constant(x) (__bswap_constant_32(x)) +#define ntohll_constant(x) (__bswap_constant_64(x)) #endif +#define ntohll(x) (be64toh((x))) +#define htonll(x) (htobe64((x))) + /** * ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address * @p: Pointer to the BE value in memory diff --git a/vhost_user.c b/vhost_user.c index 159f0b3..256c8ab 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -44,6 +44,7 @@ #include "tap.h" #include "vhost_user.h" #include "pcap.h" +#include "migrate.h" /* vhost-user version we are compatible with */ #define VHOST_USER_VERSION 1 @@ -998,36 +999,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev, } /** - * vu_set_migration_watch() - Add the migration file descriptor to epoll - * @vdev: vhost-user device - * @fd: File descriptor to add - * @direction: Direction of the migration (save or load backend state) - */ -static void vu_set_migration_watch(const struct vu_dev *vdev, int fd, - uint32_t direction) -{ - union epoll_ref ref = { - .type = EPOLL_TYPE_VHOST_MIGRATION, - .fd = fd, - }; - struct epoll_event ev = { 0 }; - - ev.data.u64 = ref.u64; - switch (direction) { - case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE: - ev.events = EPOLLOUT; - break; - case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD: - ev.events = EPOLLIN; - break; - default: - ASSERT(0); - } - - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); -} - -/** * vu_set_device_state_fd_exec() - Set the device state migration channel * @vdev: vhost-user device * @vmsg: vhost-user message @@ -1051,16 +1022,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD) die("Invalide device_state_fd direction: %d", direction); - if (vdev->device_state_fd != -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - } - - vdev->device_state_fd = msg->fds[0]; - vdev->device_state_result = -1; - vu_set_migration_watch(vdev, vdev->device_state_fd, direction); - - debug("Got device_state_fd: %d", vdev->device_state_fd); + migrate_request(vdev->context, msg->fds[0], + direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD); /* We don't provide a new fd for the data transfer */ vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK); @@ -1075,12 +1038,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev, * * Return: True as the reply contains the migration result */ +/* cppcheck-suppress constParameterCallback */ static bool vu_check_device_state_exec(struct vu_dev *vdev, struct vhost_user_msg *msg) { - (void)vdev; - - vmsg_set_reply_u64(msg, vdev->device_state_result); + vmsg_set_reply_u64(msg, vdev->context->device_state_result); return true; } @@ -1106,8 +1068,8 @@ void vu_init(struct ctx *c) } c->vdev->log_table = NULL; c->vdev->log_call_fd = -1; - c->vdev->device_state_fd = -1; - c->vdev->device_state_result = -1; + + migrate_init(c); } @@ -1157,12 +1119,8 @@ void vu_cleanup(struct vu_dev *vdev) vu_close_log(vdev); - if (vdev->device_state_fd != -1) { - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - vdev->device_state_result = -1; - } + /* If we lose the VU dev, we also lose our migration channel */ + migrate_close(vdev->context); } /** @@ -106,8 +106,6 @@ struct vu_dev_region { * @log_call_fd: Eventfd to report logging update * @log_size: Size of the logging memory region * @log_table: Base of the logging memory region - * @device_state_fd: Device state migration channel - * @device_state_result: Device state migration result */ struct vu_dev { struct ctx *context; @@ -119,8 +117,6 @@ struct vu_dev { int log_call_fd; uint64_t log_size; uint8_t *log_table; - int device_state_fd; - int device_state_result; }; /** diff --git a/vu_common.c b/vu_common.c index ab04d31..48826b1 100644 --- a/vu_common.c +++ b/vu_common.c @@ -5,6 +5,7 @@ * common_vu.c - vhost-user common UDP and TCP functions */ +#include <errno.h> #include <unistd.h> #include <sys/uio.h> #include <sys/eventfd.h> @@ -17,6 +18,7 @@ #include "vhost_user.h" #include "pcap.h" #include "vu_common.h" +#include "migrate.h" #define VU_MAX_TX_BUFFER_NB 2 @@ -303,50 +305,3 @@ err: return -1; } - -/** - * vu_migrate() - Send/receive passt insternal state to/from QEMU - * @vdev: vhost-user device - * @events: epoll events - */ -void vu_migrate(struct vu_dev *vdev, uint32_t events) -{ - int ret; - - /* TODO: collect/set passt internal state - * and use vdev->device_state_fd to send/receive it - */ - debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); - if (events & EPOLLOUT) { - debug("Saving backend state"); - - /* send some stuff */ - ret = write(vdev->device_state_fd, "PASST", 6); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - vdev->device_state_result = ret == -1 ? -1 : 0; - /* Closing the file descriptor signals the end of transfer */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } else if (events & EPOLLIN) { - char buf[6]; - - debug("Loading backend state"); - /* read some stuff */ - ret = read(vdev->device_state_fd, buf, sizeof(buf)); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - if (ret != sizeof(buf)) { - vdev->device_state_result = -1; - } else { - ret = strncmp(buf, "PASST", sizeof(buf)); - vdev->device_state_result = ret == 0 ? 0 : -1; - } - } else if (events & EPOLLHUP) { - debug("Closing migration channel"); - - /* The end of file signals the end of the transfer. */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } -} diff --git a/vu_common.h b/vu_common.h index d56c021..f538f23 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); -void vu_migrate(struct vu_dev *vdev, uint32_t events); + #endif /* VU_COMMON_H */ |