From c140a5bd5da48de353be8bf9356f71a66f622a68 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 18 Oct 2023 14:36:40 +0200 Subject: [PATCH 01/41] um: irqs: process outstanding IRQs when unblocking signals When in time-travel mode, the eventfd events are read even when signals are blocked as SIGIO still needs to be processed. In this case, the event is cleared on the eventfd but the IRQ still needs to be fired later. We did already ensure that the SIGIO handler is run again. However, the FDs are configured to be level triggered, so that eventfd will not notify again. As such, add some logic to mark the IRQ as pending and process it at the next opportunity. To avoid duplication, reuse the logic used for the suspend/resume case. This does not really change anything except for delaying running the IRQs with timetravel_handler at a slightly later point in time (and possibly running non-timetravel IRQs that shouldn't happen earlier). While at it, move marking as pending into irq_event_handler as that is the more logical place for it to happen. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20231018123643.1255813-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/irq.c | 78 ++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 635d44606bfe..ceda4bd2e5ed 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -37,7 +37,7 @@ struct irq_reg { bool pending; bool wakeup; #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - bool pending_on_resume; + bool pending_event; void (*timetravel_handler)(int, int, void *, struct time_travel_event *); struct time_travel_event event; @@ -56,6 +56,9 @@ static DEFINE_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static bool irqs_pending; +#endif static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) { @@ -84,9 +87,12 @@ static void irq_event_handler(struct time_travel_event *ev) { struct irq_reg *reg = container_of(ev, struct irq_reg, event); - /* do nothing if suspended - just to cause a wakeup */ - if (irqs_suspended) + /* do nothing if suspended; just cause a wakeup and mark as pending */ + if (irqs_suspended) { + irqs_pending = true; + reg->pending_event = true; return; + } generic_handle_irq(reg->irq); } @@ -110,16 +116,47 @@ static bool irq_do_timetravel_handler(struct irq_entry *entry, if (!reg->event.pending) return false; - if (irqs_suspended) - reg->pending_on_resume = true; return true; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ + struct irq_entry *entry; + + if (!irqs_pending || timetravel_handlers_only) + return; + + irqs_pending = false; + + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + /* + * Any timetravel_handler was invoked already, just + * directly run the IRQ. + */ + if (reg->pending_event) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_event = false; + } + } + } +} #else static bool irq_do_timetravel_handler(struct irq_entry *entry, enum um_irq_type t) { return false; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ +} #endif static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, @@ -145,6 +182,8 @@ static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type */ if (timetravel_handlers_only) { #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + reg->pending_event = true; + irqs_pending = true; mark_sigio_pending(); #endif return; @@ -162,6 +201,10 @@ static void _sigio_handler(struct uml_pt_regs *regs, if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) return; + /* Flush out pending events that were ignored due to time-travel. */ + if (!irqs_suspended) + irq_do_pending_events(timetravel_handlers_only); + while (1) { /* This is now lockless - epoll keeps back-referencesto the irqs * which have trigger it so there is no need to walk the irq @@ -543,30 +586,7 @@ void um_irqs_resume(void) unsigned long flags; - local_irq_save(flags); -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - /* - * We don't need to lock anything here since we're in resume - * and nothing else is running, but have disabled IRQs so we - * don't try anything else with the interrupt list from there. - */ - list_for_each_entry(entry, &active_fds, list) { - enum um_irq_type t; - - for (t = 0; t < NUM_IRQ_TYPES; t++) { - struct irq_reg *reg = &entry->reg[t]; - - if (reg->pending_on_resume) { - irq_enter(); - generic_handle_irq(reg->irq); - irq_exit(); - reg->pending_on_resume = false; - } - } - } -#endif - - spin_lock(&irq_lock); + spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); From c6c4cbaa01b61eff2232d8cf4d441565535aa53a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 18 Oct 2023 14:36:41 +0200 Subject: [PATCH 02/41] um: chan_user: catch EINTR when reading and writing If the read/write function returns an error then we expect to see an event/IRQ later on. However, this will only happen after an EAGAIN as we are using edge based event triggering. As such, EINTR needs to be caught should it happen. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20231018123643.1255813-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/chan_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index ec04e47b9d79..60fcb181835d 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -23,7 +23,7 @@ int generic_read(int fd, __u8 *c_out, void *unused) { int n; - n = read(fd, c_out, sizeof(*c_out)); + CATCH_EINTR(n = read(fd, c_out, sizeof(*c_out))); if (n > 0) return n; else if (n == 0) @@ -39,7 +39,7 @@ int generic_write(int fd, const __u8 *buf, size_t n, void *unused) { int err; - err = write(fd, buf, n); + CATCH_EINTR(err = write(fd, buf, n)); if (err > 0) return err; else if (errno == EAGAIN) From 4cfb44df8d12a5fc2b801d18dc41b03bf7e6cd7b Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 18 Oct 2023 14:36:42 +0200 Subject: [PATCH 03/41] um: chan_user: retry partial writes In the next commit, we are going to set the output FD to be blocking. Once that is done, the write() may be short if an interrupt happens while it is writing out data. As such, to properly catch an EINTR error, we need to retry the write. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20231018123643.1255813-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/chan_user.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index 60fcb181835d..a66e556012c4 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -37,11 +37,23 @@ int generic_read(int fd, __u8 *c_out, void *unused) int generic_write(int fd, const __u8 *buf, size_t n, void *unused) { + int written = 0; int err; - CATCH_EINTR(err = write(fd, buf, n)); - if (err > 0) - return err; + /* The FD may be in blocking mode, as such, need to retry short writes, + * they may have been interrupted by a signal. + */ + do { + errno = 0; + err = write(fd, buf + written, n - written); + if (err > 0) { + written += err; + continue; + } + } while (err < 0 && errno == EINTR); + + if (written > 0) + return written; else if (errno == EAGAIN) return 0; else if (err == 0) From b2f9b77c7f7f377534ba75508162390432a9945a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 18 Oct 2023 14:36:43 +0200 Subject: [PATCH 04/41] um: chan: use blocking IO for console output for time-travel When in time-travel mode (infinite-cpu or external) time should not pass for writing to the console. As such, it makes sense to put the FD for the output side into blocking mode and simply let any write to it hang. If we did not do this, then time could pass waiting for the console to become writable again. This is not desirable as it has random effects on the clock between runs. Implement this by duplicating the FD if output is active in a relevant mode and setting the duplicate to be blocking. This avoids changing the input channel to be blocking should it exists. After this, use the blocking FD for all write operations and do not allocate an IRQ it is set. Without time-travel mode fd_out will always match fd_in and IRQs are registered. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20231018123643.1255813-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/chan.h | 3 +- arch/um/drivers/chan_kern.c | 81 ++++++++++++++++++++++++++++--------- arch/um/include/shared/os.h | 1 + arch/um/os-Linux/file.c | 10 +++++ 4 files changed, 74 insertions(+), 21 deletions(-) diff --git a/arch/um/drivers/chan.h b/arch/um/drivers/chan.h index e14b9cdf7a33..5a61db512ffb 100644 --- a/arch/um/drivers/chan.h +++ b/arch/um/drivers/chan.h @@ -22,7 +22,8 @@ struct chan { unsigned int output:1; unsigned int opened:1; unsigned int enabled:1; - int fd; + int fd_in; + int fd_out; /* only different to fd_in if blocking output is needed */ const struct chan_ops *ops; void *data; }; diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index 37538b4168da..e78a99816c86 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -81,6 +81,12 @@ static const struct chan_ops not_configged_ops = { }; #endif /* CONFIG_NOCONFIG_CHAN */ +static inline bool need_output_blocking(void) +{ + return time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL; +} + static int open_one_chan(struct chan *chan) { int fd, err; @@ -96,15 +102,43 @@ static int open_one_chan(struct chan *chan) return fd; err = os_set_fd_block(fd, 0); - if (err) { - (*chan->ops->close)(fd, chan->data); - return err; - } + if (err) + goto out_close; - chan->fd = fd; + chan->fd_in = fd; + chan->fd_out = fd; + + /* + * In time-travel modes infinite-CPU and external we need to guarantee + * that any writes to the output succeed immdiately from the point of + * the VM. The best way to do this is to put the FD in blocking mode + * and simply wait/retry until everything is written. + * As every write is guaranteed to complete, we also do not need to + * request an IRQ for the output. + * + * Note that input cannot happen in a time synchronized way. We permit + * it, but time passes very quickly if anything waits for a read. + */ + if (chan->output && need_output_blocking()) { + err = os_dup_file(chan->fd_out); + if (err < 0) + goto out_close; + + chan->fd_out = err; + + err = os_set_fd_block(chan->fd_out, 1); + if (err) { + os_close_file(chan->fd_out); + goto out_close; + } + } chan->opened = 1; return 0; + +out_close: + (*chan->ops->close)(fd, chan->data); + return err; } static int open_chan(struct list_head *chans) @@ -125,7 +159,7 @@ static int open_chan(struct list_head *chans) void chan_enable_winch(struct chan *chan, struct tty_port *port) { if (chan && chan->primary && chan->ops->winch) - register_winch(chan->fd, port); + register_winch(chan->fd_in, port); } static void line_timer_cb(struct work_struct *work) @@ -156,8 +190,9 @@ int enable_chan(struct line *line) if (chan->enabled) continue; - err = line_setup_irq(chan->fd, chan->input, chan->output, line, - chan); + err = line_setup_irq(chan->fd_in, chan->input, + chan->output && !need_output_blocking(), + line, chan); if (err) goto out_close; @@ -196,7 +231,8 @@ void free_irqs(void) if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); - if (chan->output && chan->enabled) + if (chan->output && chan->enabled && + !need_output_blocking()) um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } @@ -216,15 +252,19 @@ static void close_one_chan(struct chan *chan, int delay_free_irq) } else { if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); - if (chan->output && chan->enabled) + if (chan->output && chan->enabled && + !need_output_blocking()) um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } + if (chan->fd_out != chan->fd_in) + os_close_file(chan->fd_out); if (chan->ops->close != NULL) - (*chan->ops->close)(chan->fd, chan->data); + (*chan->ops->close)(chan->fd_in, chan->data); chan->opened = 0; - chan->fd = -1; + chan->fd_in = -1; + chan->fd_out = -1; } void close_chan(struct line *line) @@ -244,7 +284,7 @@ void close_chan(struct line *line) void deactivate_chan(struct chan *chan, int irq) { if (chan && chan->enabled) - deactivate_fd(chan->fd, irq); + deactivate_fd(chan->fd_in, irq); } int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq) @@ -254,7 +294,7 @@ int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq) if (len == 0 || !chan || !chan->ops->write) return 0; - n = chan->ops->write(chan->fd, buf, len, chan->data); + n = chan->ops->write(chan->fd_out, buf, len, chan->data); if (chan->primary) { ret = n; } @@ -268,7 +308,7 @@ int console_write_chan(struct chan *chan, const char *buf, int len) if (!chan || !chan->ops->console_write) return 0; - n = chan->ops->console_write(chan->fd, buf, len); + n = chan->ops->console_write(chan->fd_out, buf, len); if (chan->primary) ret = n; return ret; @@ -296,14 +336,14 @@ int chan_window_size(struct line *line, unsigned short *rows_out, if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } chan = line->chan_out; if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } return 0; @@ -319,7 +359,7 @@ static void free_one_chan(struct chan *chan) (*chan->ops->free)(chan->data); if (chan->primary && chan->output) - ignore_sigio_fd(chan->fd); + ignore_sigio_fd(chan->fd_in); kfree(chan); } @@ -478,7 +518,8 @@ static struct chan *parse_chan(struct line *line, char *str, int device, .output = 0, .opened = 0, .enabled = 0, - .fd = -1, + .fd_in = -1, + .fd_out = -1, .ops = ops, .data = data }); return chan; @@ -549,7 +590,7 @@ void chan_interrupt(struct line *line, int irq) schedule_delayed_work(&line->task, 1); goto out; } - err = chan->ops->read(chan->fd, &c, chan->data); + err = chan->ops->read(chan->fd_in, &c, chan->data); if (err > 0) tty_insert_flip_char(port, c, TTY_NORMAL); } while (err > 0); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index aff8906304ea..6516ff5a0ecc 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -163,6 +163,7 @@ extern int os_set_fd_block(int fd, int blocking); extern int os_accept_connection(int fd); extern int os_create_unix_socket(const char *file, int len, int close_on_exec); extern int os_shutdown_socket(int fd, int r, int w); +extern int os_dup_file(int fd); extern void os_close_file(int fd); extern int os_rcv_fd(int fd, int *helper_pid_out); extern int os_connect_socket(const char *name); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index fc4450db59bd..abf8676b834c 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -240,6 +240,16 @@ int os_connect_socket(const char *name) return err; } +int os_dup_file(int fd) +{ + int new_fd = dup(fd); + + if (new_fd < 0) + return -errno; + + return new_fd; +} + void os_close_file(int fd) { close(fd); From 12b8e7e69aa7a008323d7a903392686ad0903828 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Thu, 28 Mar 2024 13:24:24 +0000 Subject: [PATCH 05/41] um: Remove obsolete pcap driver Remove the pcap driver in UML. It is obsolete. It does not build on recent systems due to changes in libpcap and its dependencies. The vector driver's raw transport in UML provides identical functionality. Signed-off-by: Anton Ivanov Link: https://patch.msgid.link/20240328132424.376456-1-anton.ivanov@cambridgegreys.com Signed-off-by: Johannes Berg --- arch/um/drivers/Kconfig | 20 ------ arch/um/drivers/Makefile | 10 +-- arch/um/drivers/pcap_kern.c | 113 ----------------------------- arch/um/drivers/pcap_user.c | 137 ------------------------------------ arch/um/drivers/pcap_user.h | 21 ------ 5 files changed, 2 insertions(+), 299 deletions(-) delete mode 100644 arch/um/drivers/pcap_kern.c delete mode 100644 arch/um/drivers/pcap_user.c delete mode 100644 arch/um/drivers/pcap_user.h diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index b94b2618e7d8..ede40a160c5e 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -297,26 +297,6 @@ config UML_NET_MCAST If unsure, say N. -config UML_NET_PCAP - bool "pcap transport (obsolete)" - depends on UML_NET - depends on !MODVERSIONS - select MAY_HAVE_RUNTIME_DEPS - help - The pcap transport makes a pcap packet stream on the host look - like an ethernet device inside UML. This is useful for making - UML act as a network monitor for the host. You must have libcap - installed in order to build the pcap transport into UML. - - For more information, see - That site - has examples of the UML command line to use to enable this option. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - config UML_NET_SLIRP bool "SLiRP transport (obsolete)" depends on UML_NET diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 0e6af81096fd..57882e6bc215 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -20,14 +20,9 @@ harddog-objs := harddog_kern.o harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o rtc-objs := rtc_kern.o rtc_user.o -LDFLAGS_pcap.o = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libpcap.a) - LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) -targets := pcap_kern.o pcap_user.o vde_kern.o vde_user.o - -$(obj)/pcap.o: $(obj)/pcap_kern.o $(obj)/pcap_user.o - $(LD) -r -dp -o $@ $^ $(ld_flags) +targets := vde_kern.o vde_user.o $(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o $(LD) -r -dp -o $@ $^ $(ld_flags) @@ -49,7 +44,6 @@ obj-$(CONFIG_UML_NET_DAEMON) += daemon.o obj-$(CONFIG_UML_NET_VECTOR) += vector.o obj-$(CONFIG_UML_NET_VDE) += vde.o obj-$(CONFIG_UML_NET_MCAST) += umcast.o -obj-$(CONFIG_UML_NET_PCAP) += pcap.o obj-$(CONFIG_UML_NET) += net.o obj-$(CONFIG_MCONSOLE) += mconsole.o obj-$(CONFIG_MMAPPER) += mmapper_kern.o @@ -69,7 +63,7 @@ obj-$(CONFIG_UML_RTC) += rtc.o obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o # pcap_user.o must be added explicitly. -USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c deleted file mode 100644 index d9bf95d7867b..000000000000 --- a/arch/um/drivers/pcap_kern.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include "pcap_user.h" - -struct pcap_init { - char *host_if; - int promisc; - int optimize; - char *filter; -}; - -static void pcap_init_kern(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct pcap_data *ppri; - struct pcap_init *init = data; - - pri = netdev_priv(dev); - ppri = (struct pcap_data *) pri->user; - ppri->host_if = init->host_if; - ppri->promisc = init->promisc; - ppri->optimize = init->optimize; - ppri->filter = init->filter; - - printk("pcap backend, host interface %s\n", ppri->host_if); -} - -static int pcap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return pcap_user_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER, - (struct pcap_data *) &lp->user); -} - -static int pcap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return -EPERM; -} - -static const struct net_kern_info pcap_kern_info = { - .init = pcap_init_kern, - .protocol = eth_protocol, - .read = pcap_read, - .write = pcap_write, -}; - -static int pcap_setup(char *str, char **mac_out, void *data) -{ - struct pcap_init *init = data; - char *remain, *host_if = NULL, *options[2] = { NULL, NULL }; - int i; - - *init = ((struct pcap_init) - { .host_if = "eth0", - .promisc = 1, - .optimize = 0, - .filter = NULL }); - - remain = split_if_spec(str, &host_if, &init->filter, - &options[0], &options[1], mac_out, NULL); - if (remain != NULL) { - printk(KERN_ERR "pcap_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (host_if != NULL) - init->host_if = host_if; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (options[i] == NULL) - continue; - if (!strcmp(options[i], "promisc")) - init->promisc = 1; - else if (!strcmp(options[i], "nopromisc")) - init->promisc = 0; - else if (!strcmp(options[i], "optimize")) - init->optimize = 1; - else if (!strcmp(options[i], "nooptimize")) - init->optimize = 0; - else { - printk(KERN_ERR "pcap_setup : bad option - '%s'\n", - options[i]); - return 0; - } - } - - return 1; -} - -static struct transport pcap_transport = { - .list = LIST_HEAD_INIT(pcap_transport.list), - .name = "pcap", - .setup = pcap_setup, - .user = &pcap_user_info, - .kern = &pcap_kern_info, - .private_size = sizeof(struct pcap_data), - .setup_size = sizeof(struct pcap_init), -}; - -static int register_pcap(void) -{ - register_transport(&pcap_transport); - return 0; -} - -late_initcall(register_pcap); diff --git a/arch/um/drivers/pcap_user.c b/arch/um/drivers/pcap_user.c deleted file mode 100644 index 52ddda3e3b10..000000000000 --- a/arch/um/drivers/pcap_user.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include "pcap_user.h" -#include - -#define PCAP_FD(p) (*(int *)(p)) - -static int pcap_user_init(void *data, void *dev) -{ - struct pcap_data *pri = data; - pcap_t *p; - char errors[PCAP_ERRBUF_SIZE]; - - p = pcap_open_live(pri->host_if, ETH_MAX_PACKET + ETH_HEADER_OTHER, - pri->promisc, 0, errors); - if (p == NULL) { - printk(UM_KERN_ERR "pcap_user_init : pcap_open_live failed - " - "'%s'\n", errors); - return -EINVAL; - } - - pri->dev = dev; - pri->pcap = p; - return 0; -} - -static int pcap_user_open(void *data) -{ - struct pcap_data *pri = data; - __u32 netmask; - int err; - - if (pri->pcap == NULL) - return -ENODEV; - - if (pri->filter != NULL) { - err = dev_netmask(pri->dev, &netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : dev_netmask failed\n"); - return -EIO; - } - - pri->compiled = uml_kmalloc(sizeof(struct bpf_program), - UM_GFP_KERNEL); - if (pri->compiled == NULL) { - printk(UM_KERN_ERR "pcap_user_open : kmalloc failed\n"); - return -ENOMEM; - } - - err = pcap_compile(pri->pcap, - (struct bpf_program *) pri->compiled, - pri->filter, pri->optimize, netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : pcap_compile failed - " - "'%s'\n", pcap_geterr(pri->pcap)); - goto out; - } - - err = pcap_setfilter(pri->pcap, pri->compiled); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : pcap_setfilter " - "failed - '%s'\n", pcap_geterr(pri->pcap)); - goto out; - } - } - - return PCAP_FD(pri->pcap); - - out: - kfree(pri->compiled); - return -EIO; -} - -static void pcap_remove(void *data) -{ - struct pcap_data *pri = data; - - if (pri->compiled != NULL) - pcap_freecode(pri->compiled); - - if (pri->pcap != NULL) - pcap_close(pri->pcap); -} - -struct pcap_handler_data { - char *buffer; - int len; -}; - -static void handler(u_char *data, const struct pcap_pkthdr *header, - const u_char *packet) -{ - int len; - - struct pcap_handler_data *hdata = (struct pcap_handler_data *) data; - - len = hdata->len < header->caplen ? hdata->len : header->caplen; - memcpy(hdata->buffer, packet, len); - hdata->len = len; -} - -int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) -{ - struct pcap_handler_data hdata = ((struct pcap_handler_data) - { .buffer = buffer, - .len = len }); - int n; - - n = pcap_dispatch(pri->pcap, 1, handler, (u_char *) &hdata); - if (n < 0) { - printk(UM_KERN_ERR "pcap_dispatch failed - %s\n", - pcap_geterr(pri->pcap)); - return -EIO; - } - else if (n == 0) - return 0; - return hdata.len; -} - -const struct net_user_info pcap_user_info = { - .init = pcap_user_init, - .open = pcap_user_open, - .close = NULL, - .remove = pcap_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/pcap_user.h b/arch/um/drivers/pcap_user.h deleted file mode 100644 index 216246f5f09b..000000000000 --- a/arch/um/drivers/pcap_user.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - */ - -#include - -struct pcap_data { - char *host_if; - int promisc; - int optimize; - char *filter; - void *compiled; - void *pcap; - void *dev; -}; - -extern const struct net_user_info pcap_user_info; - -extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri); - From ddd268c42871b78c75e12a5c28207fb481138f41 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 3 Apr 2024 14:43:00 +0200 Subject: [PATCH 06/41] um: Select HAS_IOREMAP for UML_IOMEM_EMULATION In a future patch HAS_IOPORT=n will disable inb()/outb() and friends at compile time. UML supports these via its UML_IOMEM_EMULATION so let that select HAS_IOPORT and also reflect this in NO_IOPORT_MAP. Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Niklas Schnelle Link: https://patch.msgid.link/20240403124300.65379-2-schnelle@linux.ibm.com Signed-off-by: Johannes Berg --- arch/um/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 93a5a8999b07..758016731156 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -48,12 +48,13 @@ config NO_IOMEM config UML_IOMEM_EMULATION bool select INDIRECT_IOMEM + select HAS_IOPORT select GENERIC_PCI_IOMAP select GENERIC_IOMAP select NO_GENERIC_PCI_IOPORT_MAP config NO_IOPORT_MAP - def_bool y + def_bool !UML_IOMEM_EMULATION config ISA bool From 7d0a8a490aa3a2a82de8826aaf1dfa38575cb77a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Apr 2024 10:27:45 +0200 Subject: [PATCH 07/41] um: time-travel: fix time-travel-start option We need to have the = as part of the option so that the value can be parsed properly. Also document that it must be given in nanoseconds, not seconds. Fixes: 065038706f77 ("um: Support time travel mode") Link: https://patch.msgid.link/20240417102744.14b9a9d4eba0.Ib22e9136513126b2099d932650f55f193120cd97@changeid Signed-off-by: Johannes Berg --- arch/um/kernel/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index a8bfe8be1526..5b5fd8f68d9c 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -875,9 +875,9 @@ static int setup_time_travel_start(char *str) return 1; } -__setup("time-travel-start", setup_time_travel_start); +__setup("time-travel-start=", setup_time_travel_start); __uml_help(setup_time_travel_start, -"time-travel-start=\n" +"time-travel-start=\n" "Configure the UML instance's wall clock to start at this value rather than\n" "the host's wall clock at the time of UML boot.\n"); #endif From 1cf855ded6962825da8623b467c4650a80776cad Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 5 May 2024 01:15:08 +0100 Subject: [PATCH 08/41] ubd: Remove unused mutex 'ubd_mutex' Commit fb5d1d389c9e ("ubd: open the backing files in ubd_add") removed the last use of ubd_mutex. Remove it. Build and kernel startup test only. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20240505001508.255096-1-linux@treblig.org Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_kern.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ef805eaa9e01..1542a6d6a7a1 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -106,7 +106,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) #define DRIVER_NAME "uml-blkdev" static DEFINE_MUTEX(ubd_lock); -static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); From 6fdae1da764abeff77d0d884101f42c114b8f93b Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 27 May 2024 21:40:23 +0800 Subject: [PATCH 09/41] um: Remove unused ncpus variable It's no longer used. And uml_ncpus_setup doesn't exist anymore. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240527134024.1539848-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/include/shared/kern_util.h | 1 - arch/um/kernel/um_arch.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 95521b1f5b20..d8ffd2db168e 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -13,7 +13,6 @@ struct siginfo; extern int uml_exitcode; -extern int ncpus; extern int kmalloc_ok; #define UML_ROUND_UP(addr) \ diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index e95f805e5004..8e594cda6d77 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -126,9 +126,6 @@ unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; unsigned long end_vm; -/* Set in uml_ncpus_setup */ -int ncpus = 1; - /* Set in early boot */ static int have_root __initdata; static int have_console __initdata; From cb2759431acae9c6093f6f4cb270e3a3bd0f4e73 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 27 May 2024 21:40:24 +0800 Subject: [PATCH 10/41] um: Remove /proc/sysemu support code Currently /proc/sysemu will never be registered, as sysemu_supported is initialized to zero implicitly and no code updates it. And there is also nothing to configure via sysemu in UML anymore. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20240527134024.1539848-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/process.c | 67 ---------------------------------------- 1 file changed, 67 deletions(-) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index d2134802f6a8..79f41f06a98f 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -237,73 +237,6 @@ int copy_from_user_proc(void *to, void __user *from, int size) return copy_from_user(to, from, size); } -static atomic_t using_sysemu = ATOMIC_INIT(0); -int sysemu_supported; - -static void set_using_sysemu(int value) -{ - if (value > sysemu_supported) - return; - atomic_set(&using_sysemu, value); -} - -static int get_using_sysemu(void) -{ - return atomic_read(&using_sysemu); -} - -static int sysemu_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", get_using_sysemu()); - return 0; -} - -static int sysemu_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, sysemu_proc_show, NULL); -} - -static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, - size_t count, loff_t *pos) -{ - char tmp[2]; - - if (copy_from_user(tmp, buf, 1)) - return -EFAULT; - - if (tmp[0] >= '0' && tmp[0] <= '2') - set_using_sysemu(tmp[0] - '0'); - /* We use the first char, but pretend to write everything */ - return count; -} - -static const struct proc_ops sysemu_proc_ops = { - .proc_open = sysemu_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, - .proc_write = sysemu_proc_write, -}; - -static int __init make_proc_sysemu(void) -{ - struct proc_dir_entry *ent; - if (!sysemu_supported) - return 0; - - ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_ops); - - if (ent == NULL) - { - printk(KERN_WARNING "Failed to register /proc/sysemu\n"); - return 0; - } - - return 0; -} - -late_initcall(make_proc_sysemu); - int singlestepping(void) { return test_thread_flag(TIF_SINGLESTEP); From 9a2123b397bbe0da5e853273369d63779ac97c8c Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 29 May 2024 17:33:35 +0800 Subject: [PATCH 11/41] arch: um: rust: Use the generated target.json again The Rust compiler can take a target config from 'target.json', which is generated by scripts/generate_rust_target.rs. It used to be that all Linux architectures used this to generate a target.json, but now architectures must opt-in to this, or they will default to the Rust compiler's built-in target definition. This is mostly okay for (64-bit) x86 and UML, except that it can generate SSE instructions, which we can't use in the kernel. So re-instate the custom target.json, which disables SSE (and generally enables the 'soft-float' feature). This fixes the following compile error: error: :0:0: in function _RNvMNtCs5QSdWC790r4_4core3f32f7next_up float (float): SSE register return with SSE disabled Fixes: f82811e22b48 ("rust: Refactor the build target to allow the use of builtin targets") Signed-off-by: David Gow Reviewed-by: Boqun Feng Tested-by: Miguel Ojeda Reviewed-by: Miguel Ojeda Link: https://patch.msgid.link/20240529093336.4075206-1-davidgow@google.com Signed-off-by: Johannes Berg --- arch/x86/Makefile.um | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 2106a2bd152b..a46b1397ad01 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -9,6 +9,7 @@ core-y += arch/x86/crypto/ # ifeq ($(CONFIG_CC_IS_CLANG),y) KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 endif From ab0f4cedc3554f921691ce5b63d59e258154e799 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 5 Jun 2024 06:40:50 +0800 Subject: [PATCH 12/41] arch: um: rust: Add i386 support for Rust At present, Rust in the kernel only supports 64-bit x86, so UML has followed suit. However, it's significantly easier to support 32-bit i386 on UML than on bare metal, as UML does not use the -mregparm option (which alters the ABI), which is not yet supported by rustc[1]. Add support for CONFIG_RUST on um/i386, by adding a new target config to generate_rust_target, and replacing various checks on CONFIG_X86_64 to also support CONFIG_X86_32. We still use generate_rust_target, rather than a built-in rustc target, in order to match x86_64, provide a future place for -mregparm, and more easily disable floating point instructions. With these changes, the KUnit tests pass with: kunit.py run --make_options LLVM=1 --kconfig_add CONFIG_RUST=y --kconfig_add CONFIG_64BIT=n --kconfig_add CONFIG_FORTIFY_SOURCE=n An earlier version of these changes was proposed on the Rust-for-Linux github[2]. [1]: https://github.com/rust-lang/rust/issues/116972 [2]: https://github.com/Rust-for-Linux/linux/pull/966 Signed-off-by: David Gow Link: https://patch.msgid.link/20240604224052.3138504-1-davidgow@google.com Signed-off-by: Johannes Berg --- Documentation/rust/arch-support.rst | 2 +- arch/um/Kconfig | 2 +- rust/Makefile | 2 +- scripts/Makefile | 2 +- scripts/generate_rust_target.rs | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst index b13e19d84744..750ff371570a 100644 --- a/Documentation/rust/arch-support.rst +++ b/Documentation/rust/arch-support.rst @@ -18,7 +18,7 @@ Architecture Level of support Constraints ``arm64`` Maintained Little Endian only. ``loongarch`` Maintained \- ``riscv`` Maintained ``riscv64`` only. -``um`` Maintained ``x86_64`` only. +``um`` Maintained \- ``x86`` Maintained ``x86_64`` only. ============= ================ ============================================== diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 758016731156..e08a4a1a4bb6 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -31,7 +31,7 @@ config UML select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c select HAVE_ARCH_VMAP_STACK - select HAVE_RUST if X86_64 + select HAVE_RUST config MMU bool diff --git a/rust/Makefile b/rust/Makefile index f70d5e244fee..83f675adbfab 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -426,7 +426,7 @@ $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--re $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE +$(call if_changed_dep,rustc_library) -ifdef CONFIG_X86_64 +ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),) $(obj)/core.o: scripts/target.json endif diff --git a/scripts/Makefile b/scripts/Makefile index fe56eeef09dd..dccef663ca82 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -12,7 +12,7 @@ hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_builder hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_gen -ifdef CONFIG_X86_64 +ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),) always-$(CONFIG_RUST) += target.json filechk_rust_target = $< < include/config/auto.conf diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 641b713a033a..87f34925eb7b 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -169,6 +169,23 @@ fn main() { ts.push("features", features); ts.push("llvm-target", "x86_64-linux-gnu"); ts.push("target-pointer-width", "64"); + } else if cfg.has("X86_32") { + // This only works on UML, as i386 otherwise needs regparm support in rustc + if !cfg.has("UML") { + panic!("32-bit x86 only works under UML"); + } + ts.push("arch", "x86"); + ts.push( + "data-layout", + "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", + ); + let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string(); + if cfg.has("MITIGATION_RETPOLINE") { + features += ",+retpoline-external-thunk"; + } + ts.push("features", features); + ts.push("llvm-target", "i386-unknown-linux-gnu"); + ts.push("target-pointer-width", "32"); } else if cfg.has("LOONGARCH") { panic!("loongarch uses the builtin rustc loongarch64-unknown-none-softfloat target"); } else { From 5cd93c7532ea31108e83720cbe25589a4ee57c50 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 15 Jun 2024 03:41:50 +0000 Subject: [PATCH 13/41] um/mm: remove redundant assignment of max_low_pfn Current calculation of max_low_pfn is introduced in commit af84eab20891 ("[PATCH] uml: fix LVM crash"). It is intended to set max_low_pfn to the same value as max_pfn. But I am not sure why the max_pfn is set to totalram_pages, which represents the number of usable pages in system instead of an absolute page frame number. (The change history stops there.) While we have already calculate it in setup_physmem(), so not necessary to do it again. Also this would help changing totalram_pages accounting, since we plan to move the accounting into __free_pages_core(). With this change, totalram_pages may not represent the total usable pages at this point, since some pages would be deferred initialized. Signed-off-by: Wei Yang CC: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Alasdair G Kergon CC: Andrew Morton CC: Mike Rapoport (IBM) CC: David Hildenbrand Acked-by: Mike Rapoport (IBM) Link: https://patch.msgid.link/20240615034150.2958-1-richard.weiyang@gmail.com Signed-off-by: Johannes Berg --- arch/um/kernel/mem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index ca91accd64fc..a5b4fe2ad931 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -73,7 +73,6 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); - max_low_pfn = totalram_pages(); max_pfn = max_low_pfn; kmalloc_ok = 1; } From 53585f9ea40a9466ab9c1151f15984513eb542f7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 1 Jul 2024 22:00:35 +0200 Subject: [PATCH 14/41] um: enable UBSAN We can select ARCH_HAS_UBSAN, it works just fine. It had been enabled and we even used it, but then commit 890a64810d59 ("ubsan: Restore dependency on ARCH_HAS_UBSAN") (correctly) disabled it again, enable ARCH_HAS_UBSAN to get it. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20240701220034.995eb04d656d.Ia29fe091b207fe66b5e26298c1e427ebcf131642@changeid Signed-off-by: Johannes Berg --- arch/um/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index e08a4a1a4bb6..1faefc0a18ea 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -32,6 +32,7 @@ config UML select TTY # Needed for line.c select HAVE_ARCH_VMAP_STACK select HAVE_RUST + select ARCH_HAS_UBSAN config MMU bool From 267ed02c2121b75e0eaaa338240453b576039e4a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jul 2024 09:24:41 +0200 Subject: [PATCH 15/41] hostfs: fix dev_t handling dev_t is a kernel type and may have different definitions in kernel and userspace. On 32-bit x86 this currently makes the stat structure being 4 bytes longer in the user code, causing stack corruption. However, this is (potentially) not the only problem, since dev_t is a different type on user/kernel side, so we don't know that the major/minor encoding isn't also different. Decode/encode it instead to address both problems. Cc: stable@vger.kernel.org Fixes: 74ce793bcbde ("hostfs: Fix ephemeral inodes") Link: https://patch.msgid.link/20240702092440.acc960585dd5.Id0767e12f562a69c6cd3c3262dc3d765db350cf6@changeid Signed-off-by: Johannes Berg --- fs/hostfs/hostfs.h | 7 ++++--- fs/hostfs/hostfs_kern.c | 10 ++++++---- fs/hostfs/hostfs_user.c | 7 ++++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 0239e3af3945..8b39c15c408c 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -63,9 +63,10 @@ struct hostfs_stat { struct hostfs_timespec atime, mtime, ctime; unsigned int blksize; unsigned long long blocks; - unsigned int maj; - unsigned int min; - dev_t dev; + struct { + unsigned int maj; + unsigned int min; + } rdev, dev; }; extern int stat_file(const char *path, struct hostfs_stat *p, int fd); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index a73d27c4dd58..2c4d503a62e0 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -530,10 +530,11 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) static int hostfs_inode_set(struct inode *ino, void *data) { struct hostfs_stat *st = data; - dev_t rdev; + dev_t dev, rdev; /* Reencode maj and min with the kernel encoding.*/ - rdev = MKDEV(st->maj, st->min); + rdev = MKDEV(st->rdev.maj, st->rdev.min); + dev = MKDEV(st->dev.maj, st->dev.min); switch (st->mode & S_IFMT) { case S_IFLNK: @@ -559,7 +560,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) return -EIO; } - HOSTFS_I(ino)->dev = st->dev; + HOSTFS_I(ino)->dev = dev; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -568,8 +569,9 @@ static int hostfs_inode_set(struct inode *ino, void *data) static int hostfs_inode_test(struct inode *inode, void *data) { const struct hostfs_stat *st = data; + dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == st->dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; } static struct inode *hostfs_iget(struct super_block *sb, char *name) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 840619e39a1a..97e9c40a9448 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -34,9 +34,10 @@ static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) p->mtime.tv_nsec = 0; p->blksize = buf->st_blksize; p->blocks = buf->st_blocks; - p->maj = os_major(buf->st_rdev); - p->min = os_minor(buf->st_rdev); - p->dev = buf->st_dev; + p->rdev.maj = os_major(buf->st_rdev); + p->rdev.min = os_minor(buf->st_rdev); + p->dev.maj = os_major(buf->st_dev); + p->dev.min = os_minor(buf->st_dev); } int stat_file(const char *path, struct hostfs_stat *p, int fd) From 6555acdefc758a4dae7038c3f2301f311e287218 Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Tue, 2 Jul 2024 19:21:18 +0200 Subject: [PATCH 16/41] um: time-travel: support time-travel protocol broadcast messages Add a message type to the time-travel protocol to broadcast a small (64-bit) value to all participants in a simulation. The main use case is to have an identical message come to all participants in a simulation, e.g. to separate out logs for different tests running in a single simulation. Down in the guts of time_travel_handle_message() we can't use printk() and not even printk_deferred(), so just store the message and print it at the start of the userspace() function. Unfortunately this means that other prints in the kernel can actually bypass the message, but in most cases where this is used, for example to separate test logs, userspace will be involved. Also, even if we could use printk_deferred(), we'd still need to flush it out in the userspace() function since otherwise userspace messages might cross it. As a result, this is a reasonable compromise, there's no need to have any core changes and it solves the main use case we have for it. Signed-off-by: Mordechay Goodstein Link: https://patch.msgid.link/20240702192118.c4093bc5b15e.I2ca8d006b67feeb866ac2017af7b741c9e06445a@changeid Signed-off-by: Johannes Berg --- arch/um/include/shared/timetravel.h | 9 +++++ arch/um/kernel/time.c | 59 +++++++++++++++++++++++++++++ arch/um/os-Linux/skas/process.c | 3 ++ include/uapi/linux/um_timetravel.h | 11 ++++++ 4 files changed, 82 insertions(+) diff --git a/arch/um/include/shared/timetravel.h b/arch/um/include/shared/timetravel.h index e5c3d69f1b69..c8db2f213dba 100644 --- a/arch/um/include/shared/timetravel.h +++ b/arch/um/include/shared/timetravel.h @@ -15,8 +15,17 @@ enum time_travel_mode { #if defined(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT) || \ defined(CONFIG_UML_TIME_TRAVEL_SUPPORT) extern enum time_travel_mode time_travel_mode; +extern int time_travel_should_print_bc_msg; #else #define time_travel_mode TT_MODE_OFF +#define time_travel_should_print_bc_msg 0 #endif /* (UML_)CONFIG_UML_TIME_TRAVEL_SUPPORT */ +void _time_travel_print_bc_msg(void); +static inline void time_travel_print_bc_msg(void) +{ + if (time_travel_should_print_bc_msg) + _time_travel_print_bc_msg(); +} + #endif /* _UM_TIME_TRAVEL_H_ */ diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 5b5fd8f68d9c..2339edc22f7c 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -60,6 +60,15 @@ enum time_travel_message_handling { TTMH_READ, }; +static u64 bc_message; +int time_travel_should_print_bc_msg; + +void _time_travel_print_bc_msg(void) +{ + time_travel_should_print_bc_msg = 0; + printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); +} + static void time_travel_handle_message(struct um_timetravel_msg *msg, enum time_travel_message_handling mode) { @@ -101,6 +110,10 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, time_travel_ext_free_until_valid = true; time_travel_ext_free_until = msg->time; break; + case UM_TIMETRAVEL_BROADCAST: + bc_message = msg->time; + time_travel_should_print_bc_msg = 1; + break; } resp.seq = msg->seq; @@ -880,4 +893,50 @@ __uml_help(setup_time_travel_start, "time-travel-start=\n" "Configure the UML instance's wall clock to start at this value rather than\n" "the host's wall clock at the time of UML boot.\n"); +static struct kobject *bc_time_kobject; + +static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%llx", bc_message); +} + +static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + u64 user_bc_message; + + ret = kstrtou64(buf, 0, &user_bc_message); + if (ret) + return ret; + + bc_message = user_bc_message; + + time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message); + pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message); + return count; +} + +static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, bc_store); + +static int __init um_bc_start(void) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return 0; + + bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj); + if (!bc_time_kobject) + return 0; + + if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr)) + pr_debug("failed to create the bc file in /sys/kernel/um_time"); + + return 0; +} + +void __exit time_exit(void) +{ + kobject_put(bc_time_kobject); +} + +late_initcall(um_bc_start); #endif diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 41a288dcfc34..45885deb6cae 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "../internal.h" int is_skas_winch(int pid, int fd, void *data) @@ -345,6 +346,8 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) interrupt_end(); while (1) { + time_travel_print_bc_msg(); + if (kill_userspace_mm[0]) fatal_sigsegv(); diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h index ca3238222b6d..078ea401aa2a 100644 --- a/include/uapi/linux/um_timetravel.h +++ b/include/uapi/linux/um_timetravel.h @@ -123,6 +123,17 @@ enum um_timetravel_ops { * the simulation. */ UM_TIMETRAVEL_GET_TOD = 8, + + /** + * @UM_TIMETRAVEL_BROADCAST: Send/Receive a broadcast message. + * This message can be used to sync all components in the system + * with a single message, if the calender gets the message, the + * calender broadcast the message to all components, and if a + * component receives it it should act based on it e.g print a + * message to it's log system. + * (calendar <-> host) + */ + UM_TIMETRAVEL_BROADCAST = 9, }; #endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */ From 5cde6096a4dd114b7dc5ac37a71778ea68507938 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jul 2024 19:21:19 +0200 Subject: [PATCH 17/41] um: generalize os_rcv_fd Change os_rcv_fd() to os_rcv_fd_msg() that can more generally receive any number of FDs in any kind of message. Link: https://patch.msgid.link/20240702192118.40b78b2bfe4e.Ic6ec12d72630e5bcae1e597d6bd5c6f29f441563@changeid Signed-off-by: Johannes Berg --- arch/um/drivers/port_kern.c | 14 +++++---- arch/um/drivers/xterm.c | 2 +- arch/um/drivers/xterm_kern.c | 13 ++++++-- arch/um/include/shared/os.h | 3 +- arch/um/kernel/ksyms.c | 2 +- arch/um/os-Linux/file.c | 61 +++++++++++++++++++----------------- 6 files changed, 54 insertions(+), 41 deletions(-) diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index c52b3ff3c092..a4508470df78 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -45,15 +45,17 @@ struct connection { static irqreturn_t pipe_interrupt(int irq, void *data) { struct connection *conn = data; - int fd; + int n_fds = 1, fd = -1; + ssize_t ret; - fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); - if (fd < 0) { - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(conn->socket[0], &fd, n_fds, &conn->helper_pid, + sizeof(conn->helper_pid)); + if (ret != sizeof(conn->helper_pid)) { + if (ret == -EAGAIN) return IRQ_NONE; - printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", - -fd); + printk(KERN_ERR "pipe_interrupt : os_rcv_fd_msg returned %zd\n", + ret); os_close_file(conn->fd); } diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index 6918de5e2956..e4316c7981e8 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -156,7 +156,7 @@ static int xterm_open(int input, int output, int primary, void *d, new = xterm_fd(fd, &data->helper_pid); if (new < 0) { err = new; - printk(UM_KERN_ERR "xterm_open : os_rcv_fd failed, err = %d\n", + printk(UM_KERN_ERR "xterm_open : xterm_fd failed, err = %d\n", -err); goto out_kill; } diff --git a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c index 8011e51993d5..3971252cb1a6 100644 --- a/arch/um/drivers/xterm_kern.c +++ b/arch/um/drivers/xterm_kern.c @@ -21,12 +21,19 @@ struct xterm_wait { static irqreturn_t xterm_interrupt(int irq, void *data) { struct xterm_wait *xterm = data; - int fd; + int fd = -1, n_fds = 1; + ssize_t ret; - fd = os_rcv_fd(xterm->fd, &xterm->pid); - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(xterm->fd, &fd, n_fds, + &xterm->pid, sizeof(xterm->pid)); + if (ret == -EAGAIN) return IRQ_NONE; + if (ret < 0) + fd = ret; + else if (ret != sizeof(xterm->pid)) + fd = -EMSGSIZE; + xterm->new_fd = fd; complete(&xterm->ready); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 6516ff5a0ecc..fc4091c127c8 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -165,7 +165,8 @@ extern int os_create_unix_socket(const char *file, int len, int close_on_exec); extern int os_shutdown_socket(int fd, int r, int w); extern int os_dup_file(int fd); extern void os_close_file(int fd); -extern int os_rcv_fd(int fd, int *helper_pid_out); +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len); extern int os_connect_socket(const char *name); extern int os_file_type(char *file); extern int os_file_mode(const char *file, struct openflags *mode_out); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 3a85bde3e173..f2fb77da08cf 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(os_shutdown_socket); EXPORT_SYMBOL(os_create_unix_socket); EXPORT_SYMBOL(os_connect_socket); EXPORT_SYMBOL(os_accept_connection); -EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(os_rcv_fd_msg); EXPORT_SYMBOL(run_helper); EXPORT_SYMBOL(os_major); EXPORT_SYMBOL(os_minor); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index abf8676b834c..6d3c73910946 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -512,44 +512,47 @@ int os_shutdown_socket(int fd, int r, int w) return 0; } -int os_rcv_fd(int fd, int *helper_pid_out) +/** + * os_rcv_fd_msg - receive message with (optional) FDs + * @fd: the FD to receive from + * @fds: the array for FDs to write to + * @n_fds: number of FDs to receive (@fds array size) + * @data: the message buffer + * @data_len: the size of the message to receive + * + * Receive a message with FDs. + * + * Returns: the size of the received message, or an error code + */ +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len) { - int new, n; - char buf[CMSG_SPACE(sizeof(new))]; - struct msghdr msg; + char buf[CMSG_SPACE(sizeof(*fds) * n_fds)]; struct cmsghdr *cmsg; - struct iovec iov; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - iov = ((struct iovec) { .iov_base = helper_pid_out, - .iov_len = sizeof(*helper_pid_out) }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; + struct iovec iov = { + .iov_base = data, + .iov_len = data_len, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = sizeof(buf), + }; + int n; n = recvmsg(fd, &msg, 0); if (n < 0) return -errno; - else if (n != iov.iov_len) - *helper_pid_out = -1; cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "rcv_fd didn't receive anything, " - "error = %d\n", errno); - return -1; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n"); - return -1; - } + if (!cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return n; - new = ((int *) CMSG_DATA(cmsg))[0]; - return new; + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len); + return n; } int os_create_unix_socket(const char *file, int len, int close_on_exec) From e20f9b3c59041d82199e7a622efa4495dade66c5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jul 2024 19:21:20 +0200 Subject: [PATCH 18/41] um: add mmap/mremap OS calls For the upcoming shared-memory time-travel external optimisations, we need to be able to mmap/mremap. Add the necessary OS calls. Link: https://patch.msgid.link/20240702192118.ca4472963638.Ic2da1d3a983fe57340c1b693badfa9c5bd2d8c61@changeid Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 2 ++ arch/um/os-Linux/file.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index fc4091c127c8..d269637adcd6 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -181,6 +181,8 @@ extern int os_eventfd(unsigned int initval, int flags); extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, unsigned int fds_num); int os_poll(unsigned int n, const int *fds); +void *os_mmap_rw_shared(int fd, size_t size); +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size); /* start_up.c */ extern void os_early_checks(void); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 6d3c73910946..5adf8f630049 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -718,3 +719,25 @@ int os_poll(unsigned int n, const int *fds) return -EIO; } + +void *os_mmap_rw_shared(int fd, size_t size) +{ + void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (res == MAP_FAILED) + return NULL; + + return res; +} + +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size) +{ + void *res; + + res = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE, NULL); + + if (res == MAP_FAILED) + return NULL; + + return res; +} From bfb80d8bc92fa70f5b44a57ed2b24d57685fe188 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 2 Jul 2024 19:21:21 +0200 Subject: [PATCH 19/41] um: add shared memory optimisation for time-travel=ext With external time travel, a LOT of message can end up being exchanged on the socket, taking a significant amount of time just to do that. Add a new shared memory optimisation to that, where a number of changes are made: - the controller sends a client ID and a shared memory FD (and a logging FD we don't use) in the ACK message to the initial START - the shared memory holds the current time and the free_until value, so that there's no need to exchange messages for that - if the client that's running has shared memory support, any client (the running one included) can request the next time it wants to run inside the shared memory, rather than sending a message, by also updating the free_until value - when shared memory is enabled, RUN/WAIT messages no longer have an ACK, further cutting down on messages Together, this can reduce the number of messages very significantly, and reduce overall test/simulation run time. Co-developed-by: Mordechay Goodstein Signed-off-by: Mordechay Goodstein Link: https://patch.msgid.link/20240702192118.6ad0a083f574.Ie41206c8ce4507fe26b991937f47e86c24ca7a31@changeid Signed-off-by: Johannes Berg --- arch/um/kernel/time.c | 130 +++++++++++++++++++-- include/uapi/linux/um_timetravel.h | 179 ++++++++++++++++++++++++++--- 2 files changed, 283 insertions(+), 26 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 2339edc22f7c..d44eaad53d9d 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL_GPL(time_travel_mode); static bool time_travel_start_set; static unsigned long long time_travel_start; static unsigned long long time_travel_time; +static unsigned long long time_travel_shm_offset; static LIST_HEAD(time_travel_events); static LIST_HEAD(time_travel_irqs); static unsigned long long time_travel_timer_interval; @@ -40,8 +41,11 @@ static int time_travel_ext_fd = -1; static unsigned int time_travel_ext_waiting; static bool time_travel_ext_prev_request_valid; static unsigned long long time_travel_ext_prev_request; -static bool time_travel_ext_free_until_valid; -static unsigned long long time_travel_ext_free_until; +static unsigned long long *time_travel_ext_free_until; +static unsigned long long _time_travel_ext_free_until; +static u16 time_travel_shm_id; +static struct um_timetravel_schedshm *time_travel_shm; +static union um_timetravel_schedshm_client *time_travel_shm_client; static void time_travel_set_time(unsigned long long ns) { @@ -58,6 +62,7 @@ enum time_travel_message_handling { TTMH_IDLE, TTMH_POLL, TTMH_READ, + TTMH_READ_START_ACK, }; static u64 bc_message; @@ -69,6 +74,40 @@ void _time_travel_print_bc_msg(void) printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); } +static void time_travel_setup_shm(int fd, u16 id) +{ + u32 len; + + time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm)); + + if (!time_travel_shm) + goto out; + + len = time_travel_shm->len; + + if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION || + len < struct_size(time_travel_shm, clients, id + 1)) { + os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm)); + time_travel_shm = NULL; + goto out; + } + + time_travel_shm = os_mremap_rw_shared(time_travel_shm, + sizeof(*time_travel_shm), + len); + if (!time_travel_shm) + goto out; + + time_travel_shm_offset = time_travel_shm->current_time; + time_travel_shm_client = &time_travel_shm->clients[id]; + time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE; + time_travel_shm_id = id; + /* always look at that free_until from now on */ + time_travel_ext_free_until = &time_travel_shm->free_until; +out: + os_close_file(fd); +} + static void time_travel_handle_message(struct um_timetravel_msg *msg, enum time_travel_message_handling mode) { @@ -89,7 +128,20 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, } } - ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + if (unlikely(mode == TTMH_READ_START_ACK)) { + int fd[UM_TIMETRAVEL_SHARED_MAX_FDS]; + + ret = os_rcv_fd_msg(time_travel_ext_fd, fd, + ARRAY_SIZE(fd), msg, sizeof(*msg)); + if (ret == sizeof(*msg)) { + time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD], + msg->time & UM_TIMETRAVEL_START_ACK_ID); + /* we don't use the logging for now */ + os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]); + } + } else { + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + } if (ret == 0) panic("time-travel external link is broken\n"); @@ -105,10 +157,20 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, return; case UM_TIMETRAVEL_RUN: time_travel_set_time(msg->time); + if (time_travel_shm) { + /* no request right now since we're running */ + time_travel_shm_client->flags &= + ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + /* no ack for shared memory RUN */ + return; + } break; case UM_TIMETRAVEL_FREE_UNTIL: - time_travel_ext_free_until_valid = true; - time_travel_ext_free_until = msg->time; + /* not supposed to get this with shm, but ignore it */ + if (time_travel_shm) + break; + time_travel_ext_free_until = &_time_travel_ext_free_until; + _time_travel_ext_free_until = msg->time; break; case UM_TIMETRAVEL_BROADCAST: bc_message = msg->time; @@ -149,8 +211,15 @@ static u64 time_travel_ext_req(u32 op, u64 time) block_signals_hard(); os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + /* no ACK expected for WAIT in shared memory mode */ + if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm) + goto done; + while (msg.op != UM_TIMETRAVEL_ACK) - time_travel_handle_message(&msg, TTMH_READ); + time_travel_handle_message(&msg, + op == UM_TIMETRAVEL_START ? + TTMH_READ_START_ACK : + TTMH_READ); if (msg.seq != mseq) panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", @@ -158,6 +227,7 @@ static u64 time_travel_ext_req(u32 op, u64 time) if (op == UM_TIMETRAVEL_GET) time_travel_set_time(msg.time); +done: unblock_signals_hard(); return msg.time; @@ -193,13 +263,33 @@ static void time_travel_ext_update_request(unsigned long long time) /* * if we're running and are allowed to run past the request * then we don't need to update it either + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. */ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return; time_travel_ext_prev_request = time; time_travel_ext_prev_request_valid = true; + + if (time_travel_shm) { + union um_timetravel_schedshm_client *running; + + running = &time_travel_shm->clients[time_travel_shm->running_id]; + + if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) { + time_travel_shm_client->flags |= + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + time += time_travel_shm_offset; + time_travel_shm_client->req_time = time; + if (time < time_travel_shm->free_until) + time_travel_shm->free_until = time; + return; + } + } + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); } @@ -207,6 +297,14 @@ void __time_travel_propagate_time(void) { static unsigned long long last_propagated; + if (time_travel_shm) { + if (time_travel_shm->running_id != time_travel_shm_id) + panic("time-travel: setting time while not running\n"); + time_travel_shm->current_time = time_travel_time + + time_travel_shm_offset; + return; + } + if (last_propagated == time_travel_time) return; @@ -222,9 +320,12 @@ static bool time_travel_ext_request(unsigned long long time) * If we received an external sync point ("free until") then we * don't have to request/wait for anything until then, unless * we're already waiting. + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. */ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return false; time_travel_ext_update_request(time); @@ -238,7 +339,8 @@ static void time_travel_ext_wait(bool idle) }; time_travel_ext_prev_request_valid = false; - time_travel_ext_free_until_valid = false; + if (!time_travel_shm) + time_travel_ext_free_until = NULL; time_travel_ext_waiting++; time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); @@ -261,7 +363,11 @@ static void time_travel_ext_wait(bool idle) static void time_travel_ext_get_time(void) { - time_travel_ext_req(UM_TIMETRAVEL_GET, -1); + if (time_travel_shm) + time_travel_set_time(time_travel_shm->current_time - + time_travel_shm_offset); + else + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); } static void __time_travel_update_time(unsigned long long ns, bool idle) diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h index 078ea401aa2a..546a690b0346 100644 --- a/include/uapi/linux/um_timetravel.h +++ b/include/uapi/linux/um_timetravel.h @@ -1,17 +1,6 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ /* - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019 - 2023 Intel Corporation */ #ifndef _UAPI_LINUX_UM_TIMETRAVEL_H #define _UAPI_LINUX_UM_TIMETRAVEL_H @@ -50,6 +39,36 @@ struct um_timetravel_msg { __u64 time; }; +/* max number of file descriptors that can be sent/received in a message */ +#define UM_TIMETRAVEL_MAX_FDS 2 + +/** + * enum um_timetravel_shared_mem_fds - fds sent in ACK message for START message + */ +enum um_timetravel_shared_mem_fds { + /** + * @UM_TIMETRAVEL_SHARED_MEMFD: Index of the shared memory file + * descriptor in the control message + */ + UM_TIMETRAVEL_SHARED_MEMFD, + /** + * @UM_TIMETRAVEL_SHARED_LOGFD: Index of the logging file descriptor + * in the control message + */ + UM_TIMETRAVEL_SHARED_LOGFD, + UM_TIMETRAVEL_SHARED_MAX_FDS, +}; + +/** + * enum um_timetravel_start_ack - ack-time mask for start message + */ +enum um_timetravel_start_ack { + /** + * @UM_TIMETRAVEL_START_ACK_ID: client ID that controller allocated. + */ + UM_TIMETRAVEL_START_ACK_ID = 0xffff, +}; + /** * enum um_timetravel_ops - Operation codes */ @@ -57,7 +76,9 @@ enum um_timetravel_ops { /** * @UM_TIMETRAVEL_ACK: response (ACK) to any previous message, * this usually doesn't carry any data in the 'time' field - * unless otherwise specified below + * unless otherwise specified below, note: while using shared + * memory no ACK for WAIT and RUN messages, for more info see + * &struct um_timetravel_schedshm. */ UM_TIMETRAVEL_ACK = 0, @@ -136,4 +157,134 @@ enum um_timetravel_ops { UM_TIMETRAVEL_BROADCAST = 9, }; +/* version of struct um_timetravel_schedshm */ +#define UM_TIMETRAVEL_SCHEDSHM_VERSION 2 + +/** + * enum um_timetravel_schedshm_cap - time travel capabilities of every client + * + * These flags must be set immediately after processing the ACK to + * the START message, before sending any message to the controller. + */ +enum um_timetravel_schedshm_cap { + /** + * @UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE: client can read current time + * update internal time request to shared memory and read + * free until and send no Ack on RUN and doesn't expect ACK on + * WAIT. + */ + UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE = 0x1, +}; + +/** + * enum um_timetravel_schedshm_flags - time travel flags of every client + */ +enum um_timetravel_schedshm_flags { + /** + * @UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN: client has a request to run. + * It's set by client when it has a request to run, if (and only + * if) the @running_id points to a client that is able to use + * shared memory, i.e. has %UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE + * (this includes the client itself). Otherwise, a message must + * be used. + */ + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN = 0x1, +}; + +/** + * DOC: Time travel shared memory overview + * + * The main purpose of the shared memory is to avoid all time travel message + * that don't need any action, for example current time can be held in shared + * memory without the need of any client to send a message UM_TIMETRAVEL_GET + * in order to know what's the time. + * + * Since this is shared memory with all clients and controller and controller + * creates the shared memory space, all time values are absolute to controller + * time. So first time client connects to shared memory mode it should take the + * current_time value in shared memory and keep it internally as a diff to + * shared memory times, and once shared memory is initialized, any interaction + * with the controller must happen in the controller time domain, including any + * messages (for clients that are not using shared memory, the controller will + * handle an offset and make the clients think they start at time zero.) + * + * Along with the shared memory file descriptor is sent to the client a logging + * file descriptor, to have all logs related to shared memory, + * logged into one place. note: to have all logs synced into log file at write, + * file should be flushed (fflush) after writing to it. + * + * To avoid memory corruption, we define below for each field who can write to + * it at what time, defined in the structure fields. + * + * To avoid having to pack this struct, all fields in it must be naturally aligned + * (i.e. aligned to their size). + */ + +/** + * union um_timetravel_schedshm_client - UM time travel client struct + * + * Every entity using the shared memory including the controller has a place in + * the um_timetravel_schedshm clients array, that holds info related to the client + * using the shared memory, and can be set only by the client after it gets the + * fd memory. + * + * @capa: bit fields with client capabilities see + * &enum um_timetravel_schedshm_cap, set by client once after getting the + * shared memory file descriptor. + * @flags: bit fields for flags see &enum um_timetravel_schedshm_flags for doc. + * @req_time: request time to run, set by client on every request it needs. + * @name: unique id sent to the controller by client with START message. + */ +union um_timetravel_schedshm_client { + struct { + __u32 capa; + __u32 flags; + __u64 req_time; + __u64 name; + }; + char reserve[128]; /* reserved for future usage */ +}; + +/** + * struct um_timetravel_schedshm - UM time travel shared memory struct + * + * @hdr: header fields: + * @version: Current version struct UM_TIMETRAVEL_SCHEDSHM_VERSION, + * set by controller once at init, clients must check this after mapping + * and work without shared memory if they cannot handle the indicated + * version. + * @len: Length of all the memory including header (@hdr), clients should once + * per connection first mmap the header and take the length (@len) to remap the entire size. + * This is done in order to support dynamic struct size letting number of + * clients be dynamic based on controller support. + * @free_until: Stores the next request to run by any client, in order for the + * current client to know how long it can still run. A client needs to (at + * least) reload this value immediately after communicating with any other + * client, since the controller will update this field when a new request + * is made by any client. Clients also must update this value when they + * insert/update an own request into the shared memory while not running + * themselves, and the new request is before than the current value. + * current_time: Current time, can only be set by the client in running state + * (indicated by @running_id), though that client may only run until @free_until, + * so it must remain smaller than @free_until. + * @running_id: The current client in state running, set before a client is + * notified that it's now running. + * @max_clients: size of @clients array, set once at init by the controller. + * @clients: clients array see &union um_timetravel_schedshm_client for doc, + * set only by client. + */ +struct um_timetravel_schedshm { + union { + struct { + __u32 version; + __u32 len; + __u64 free_until; + __u64 current_time; + __u16 running_id; + __u16 max_clients; + }; + char hdr[4096]; /* align to 4K page size */ + }; + union um_timetravel_schedshm_client clients[]; +}; #endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */ From 36c5005f11bec763ae296732c71170aaea8ea204 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 2 Jul 2024 12:50:02 -0700 Subject: [PATCH 20/41] um: harddog: add missing MODULE_DESCRIPTION() macro With ARCH=um, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in arch/um/drivers/harddog.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://patch.msgid.link/20240702-md-um-arch-um-drivers-v1-1-79e4f50b5bab@quicinc.com Signed-off-by: Johannes Berg --- arch/um/drivers/harddog_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 60d1c6cab8a9..99a7144b229f 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -49,6 +49,7 @@ #include "mconsole.h" #include "harddog.h" +MODULE_DESCRIPTION("UML hardware watchdog"); MODULE_LICENSE("GPL"); static DEFINE_MUTEX(harddog_mutex); From be5d511d1a2b242b35a42f9d512bcf3de6f35bde Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 2 Jul 2024 13:27:34 -0700 Subject: [PATCH 21/41] hostfs: add missing MODULE_DESCRIPTION() macro With ARCH=um, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/hostfs/hostfs.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Link: https://patch.msgid.link/20240702-md-um-fs-hostfs-v1-1-fd2b565027e7@quicinc.com Signed-off-by: Johannes Berg --- fs/hostfs/hostfs_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2c4d503a62e0..6798d25d25a3 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -1012,4 +1012,5 @@ static void __exit exit_hostfs(void) module_init(init_hostfs) module_exit(exit_hostfs) +MODULE_DESCRIPTION("User-Mode Linux Host filesystem"); MODULE_LICENSE("GPL"); From 45610225885564dc1f962cc5be02e3b33ca81de2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 13:01:05 +0200 Subject: [PATCH 22/41] um: time-travel: remove time_exit() This function is unused and unneeded, remove it. Link: https://patch.msgid.link/20240703130105.02b3a974acb7.I7264821f7cfa17ea713b7a3e4787aa41a3107d01@changeid Signed-off-by: Johannes Berg --- arch/um/kernel/time.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index d44eaad53d9d..47b9f5e63566 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -1038,11 +1038,5 @@ static int __init um_bc_start(void) return 0; } - -void __exit time_exit(void) -{ - kobject_put(bc_time_kobject); -} - late_initcall(um_bc_start); #endif From 2cf3a3c4b84def5406b830452b1cb8bbfffe0ebe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 13:01:45 +0200 Subject: [PATCH 23/41] um: time-travel: fix signal blocking race/hang When signals are hard-blocked in order to do time-travel socket processing, we set signals_blocked and then handle SIGIO signals by setting the SIGIO bit in signals_pending. When unblocking, we first set signals_blocked to 0, and then handle all pending signals. We have to set it first, so that we can again properly block/unblock inside the unblock, if the time-travel handlers need to be processed. Unfortunately, this is racy. We can get into this situation: // signals_pending = SIGIO_MASK unblock_signals_hard() signals_blocked = 0; if (signals_pending && signals_enabled) { block_signals(); unblock_signals() ... sig_handler_common(SIGIO, NULL, NULL); sigio_handler() ... sigio_reg_handler() irq_do_timetravel_handler() reg->timetravel_handler() == vu_req_interrupt_comm_handler() vu_req_read_message() vhost_user_recv_req() vhost_user_recv() vhost_user_recv_header() // reads 12 bytes header of // 20 bytes message <-- receive SIGIO here <-- sig_handler() int enabled = signals_enabled; // 1 if ((signals_blocked || !enabled) && (sig == SIGIO)) { if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL) sigio_run_timetravel_handlers() _sigio_handler() sigio_reg_handler() ... as above ... vhost_user_recv_header() // reads 8 bytes that were message payload // as if it were header - but aborts since // it then gets -EAGAIN ... --> end signal handler --> // continue in vhost_user_recv() // full_read() for 8 bytes payload busy loops // entire process hangs here Conceptually, to fix this, we need to ensure that the signal handler cannot run while we hard-unblock signals. The thing that makes this more complex is that we can be doing hard-block/unblock while unblocking. Introduce a new signals_blocked_pending variable that we can keep at non-zero as long as pending signals are being processed, then we only need to ensure it's decremented safely and the signal handler will only increment it if it's already non-zero (or signals_blocked is set, of course.) Note also that only the outermost call to hard-unblock is allowed to decrement signals_blocked_pending, since it could otherwise reach zero in an inner call, and leave the same race happening if the timetravel_handler loops, but that's basically required of it. Fixes: d6b399a0e02a ("um: time-travel/signals: fix ndelay() in interrupt") Link: https://patch.msgid.link/20240703110144.28034-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/os-Linux/signal.c | 118 +++++++++++++++++++++++++++++++------- 1 file changed, 98 insertions(+), 20 deletions(-) diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 787cfb9a0308..b11ed66c8bb0 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -65,9 +66,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) int signals_enabled; #ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT -static int signals_blocked; -#else -#define signals_blocked 0 +static int signals_blocked, signals_blocked_pending; #endif static unsigned int signals_pending; static unsigned int signals_active = 0; @@ -76,14 +75,27 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { int enabled = signals_enabled; - if ((signals_blocked || !enabled) && (sig == SIGIO)) { +#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT + if ((signals_blocked || + __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) && + (sig == SIGIO)) { + /* increment so unblock will do another round */ + __atomic_add_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST); + return; + } +#endif + + if (!enabled && (sig == SIGIO)) { /* * In TT_MODE_EXTERNAL, need to still call time-travel - * handlers unless signals are also blocked for the - * external time message processing. This will mark - * signals_pending by itself (only if necessary.) + * handlers. This will mark signals_pending by itself + * (only if necessary.) + * Note we won't get here if signals are hard-blocked + * (which is handled above), in that case the hard- + * unblock will handle things. */ - if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL) + if (time_travel_mode == TT_MODE_EXTERNAL) sigio_run_timetravel_handlers(); else signals_pending |= SIGIO_MASK; @@ -380,33 +392,99 @@ int um_set_signals_trace(int enable) #ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT void mark_sigio_pending(void) { + /* + * It would seem that this should be atomic so + * it isn't a read-modify-write with a signal + * that could happen in the middle, losing the + * value set by the signal. + * + * However, this function is only called when in + * time-travel=ext simulation mode, in which case + * the only signal ever pending is SIGIO, which + * is blocked while this can be called, and the + * timer signal (SIGALRM) cannot happen. + */ signals_pending |= SIGIO_MASK; } void block_signals_hard(void) { - if (signals_blocked) - return; - signals_blocked = 1; + signals_blocked++; barrier(); } void unblock_signals_hard(void) { + static bool unblocking; + if (!signals_blocked) + panic("unblocking signals while not blocked"); + + if (--signals_blocked) return; - /* Must be set to 0 before we check the pending bits etc. */ - signals_blocked = 0; + /* + * Must be set to 0 before we check pending so the + * SIGIO handler will run as normal unless we're still + * going to process signals_blocked_pending. + */ barrier(); - if (signals_pending && signals_enabled) { - /* this is a bit inefficient, but that's not really important */ - block_signals(); - unblock_signals(); - } else if (signals_pending & SIGIO_MASK) { - /* we need to run time-travel handlers even if not enabled */ - sigio_run_timetravel_handlers(); + /* + * Note that block_signals_hard()/unblock_signals_hard() can be called + * within the unblock_signals()/sigio_run_timetravel_handlers() below. + * This would still be prone to race conditions since it's actually a + * call _within_ e.g. vu_req_read_message(), where we observed this + * issue, which loops. Thus, if the inner call handles the recorded + * pending signals, we can get out of the inner call with the real + * signal hander no longer blocked, and still have a race. Thus don't + * handle unblocking in the inner call, if it happens, but only in + * the outermost call - 'unblocking' serves as an ownership for the + * signals_blocked_pending decrement. + */ + if (unblocking) + return; + unblocking = true; + + while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) { + if (signals_enabled) { + /* signals are enabled so we can touch this */ + signals_pending |= SIGIO_MASK; + /* + * this is a bit inefficient, but that's + * not really important + */ + block_signals(); + unblock_signals(); + } else { + /* + * we need to run time-travel handlers even + * if not enabled + */ + sigio_run_timetravel_handlers(); + } + + /* + * The decrement of signals_blocked_pending must be atomic so + * that the signal handler will either happen before or after + * the decrement, not during a read-modify-write: + * - If it happens before, it can increment it and we'll + * decrement it and do another round in the loop. + * - If it happens after it'll see 0 for both signals_blocked + * and signals_blocked_pending and thus run the handler as + * usual (subject to signals_enabled, but that's unrelated.) + * + * Note that a call to unblock_signals_hard() within the calls + * to unblock_signals() or sigio_run_timetravel_handlers() above + * will do nothing due to the 'unblocking' state, so this cannot + * underflow as the only one decrementing will be the outermost + * one. + */ + if (__atomic_sub_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST) < 0) + panic("signals_blocked_pending underflow"); } + + unblocking = false; } #endif From d1d3a2e69b2419d77f8f99315ac931c5ba3cb475 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:25 +0200 Subject: [PATCH 24/41] um: Remove stub-data.h include from common-offsets.h Further commits will require values from common-offsets.h inside stub-data.h. Resolve the possible circular dependency and simply use offsetof() inside stub_32.h and stub_64.h. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/common-offsets.h | 5 ----- arch/x86/um/shared/sysdep/stub_32.h | 7 ++++--- arch/x86/um/shared/sysdep/stub_64.h | 7 ++++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 96195483fbd0..579ed946a3a9 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* for use by sys-$SUBARCH/kernel-offsets.c */ -#include DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); @@ -30,7 +29,3 @@ DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); #endif -/* for stub */ -DEFINE(UML_STUB_FIELD_OFFSET, offsetof(struct stub_data, offset)); -DEFINE(UML_STUB_FIELD_CHILD_ERR, offsetof(struct stub_data, child_err)); -DEFINE(UML_STUB_FIELD_FD, offsetof(struct stub_data, fd)); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index ea8b5a2d67af..2748b7ee031a 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -6,6 +6,7 @@ #ifndef __SYSDEP_STUB_H #define __SYSDEP_STUB_H +#include #include #include @@ -98,9 +99,9 @@ static __always_inline void remap_stack_and_trap(void) : : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), "g" (STUB_MMAP_NR), - "g" (UML_STUB_FIELD_FD), - "g" (UML_STUB_FIELD_OFFSET), - "g" (UML_STUB_FIELD_CHILD_ERR), + "g" (offsetof(struct stub_data, fd)), + "g" (offsetof(struct stub_data, offset)), + "g" (offsetof(struct stub_data, child_err)), "c" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), "d" (PROT_READ | PROT_WRITE), "S" (MAP_FIXED | MAP_SHARED) diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index b24168ef0ac4..50c5e0529dfb 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -6,6 +6,7 @@ #ifndef __SYSDEP_STUB_H #define __SYSDEP_STUB_H +#include #include #include #include @@ -101,9 +102,9 @@ static __always_inline void remap_stack_and_trap(void) "g" (STUB_MMAP_NR), "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), "g" (MAP_FIXED | MAP_SHARED), - "g" (UML_STUB_FIELD_FD), - "g" (UML_STUB_FIELD_OFFSET), - "g" (UML_STUB_FIELD_CHILD_ERR), + "g" (offsetof(struct stub_data, fd)), + "g" (offsetof(struct stub_data, offset)), + "g" (offsetof(struct stub_data, child_err)), "S" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), "d" (PROT_READ | PROT_WRITE) : From dc26184a9d4acde988e996a437b0301b81c1ae2d Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:26 +0200 Subject: [PATCH 25/41] um: Create signal stack memory assignment in stub_data When we switch to use seccomp, we need both the signal stack and other data (i.e. syscall information) to co-exist in the stub data. To facilitate this, start by defining separate memory areas for the stack and syscall data. This moves the signal stack onto a new page as the memory area is not sufficient to hold both signal stack and syscall information. Only change the signal stack setup for now, as the syscall code will be reworked later. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/as-layout.h | 2 +- arch/um/include/shared/skas/stub-data.h | 9 +++++++++ arch/um/kernel/skas/clone.c | 6 ++++-- arch/um/kernel/skas/mmu.c | 4 ++++ arch/um/os-Linux/skas/process.c | 11 ++++++----- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index c22f46a757dc..06292fca5a4d 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -23,7 +23,7 @@ #define STUB_START stub_start #define STUB_CODE STUB_START #define STUB_DATA (STUB_CODE + UM_KERN_PAGE_SIZE) -#define STUB_DATA_PAGES 1 /* must be a power of two */ +#define STUB_DATA_PAGES 2 /* must be a power of two */ #define STUB_END (STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE) #ifndef __ASSEMBLY__ diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 5e3ade3fb38b..779d2a3bac5d 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -8,10 +8,19 @@ #ifndef __STUB_DATA_H #define __STUB_DATA_H +#include +#include + struct stub_data { unsigned long offset; int fd; long parent_err, child_err; + + /* 128 leaves enough room for additional fields in the struct */ + unsigned char syscall_data[UM_KERN_PAGE_SIZE - 128] __aligned(16); + + /* Stack for our signal handlers and for calling into . */ + unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); }; #endif diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c index 62435187dda4..906f7454887c 100644 --- a/arch/um/kernel/skas/clone.c +++ b/arch/um/kernel/skas/clone.c @@ -27,9 +27,11 @@ stub_clone_handler(void) struct stub_data *data = get_stub_data(); long err; + /* syscall data as a temporary stack area (bottom half). */ err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - (unsigned long)data + - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE / 2); + (unsigned long) data->syscall_data + + sizeof(data->syscall_data) / 2 - + sizeof(void *)); if (err) { data->parent_err = err; goto done; diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index aeed1c2aaf3c..681839cdd795 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -14,6 +14,10 @@ #include #include #include +#include + +/* Ensure the stub_data struct covers the allocated area */ +static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); int init_new_context(struct task_struct *task, struct mm_struct *mm) { diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 45885deb6cae..56403a1b006a 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -474,11 +474,12 @@ static int __init init_thread_regs(void) thread_regs[REGS_IP_INDEX] = STUB_CODE + (unsigned long) stub_clone_handler - (unsigned long) __syscall_stub_start; - thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - - sizeof(void *); -#ifdef __SIGNAL_FRAMESIZE - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; -#endif + + /* syscall data as a temporary stack area (top half). */ + thread_regs[REGS_SP_INDEX] = STUB_DATA + + offsetof(struct stub_data, syscall_data) + + sizeof(((struct stub_data *) 0)->syscall_data) - + sizeof(void *); return 0; } From 542dc79f6ea601788704a79ff54283c2bea265e9 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:27 +0200 Subject: [PATCH 26/41] um: Add generic stub_syscall6 function This function will be used by the new syscall handling code. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/shared/sysdep/stub_32.h | 22 ++++++++++++++++++++++ arch/x86/um/shared/sysdep/stub_64.h | 16 ++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 2748b7ee031a..ab08a69fb57f 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -80,6 +80,28 @@ static __always_inline long stub_syscall5(long syscall, long arg1, long arg2, return ret; } +static __always_inline long stub_syscall6(long syscall, long arg1, long arg2, + long arg3, long arg4, long arg5, + long arg6) +{ + struct syscall_args { + int ebx, ebp; + } args = { arg1, arg6 }; + long ret; + + __asm__ volatile ("pushl %%ebp;" + "movl 0x4(%%ebx),%%ebp;" + "movl (%%ebx),%%ebx;" + "int $0x80;" + "popl %%ebp" + : "=a" (ret) + : "0" (syscall), "b" (&args), + "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5) + : "memory"); + + return ret; +} + static __always_inline void trap_myself(void) { __asm("int3"); diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 50c5e0529dfb..d27b34d75d70 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -80,6 +80,22 @@ static __always_inline long stub_syscall5(long syscall, long arg1, long arg2, return ret; } +static __always_inline long stub_syscall6(long syscall, long arg1, long arg2, + long arg3, long arg4, long arg5, + long arg6) +{ + long ret; + + __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; " + __syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3), + "g" (arg4), "g" (arg5), "g" (arg6) + : __syscall_clobber, "r10", "r8", "r9"); + + return ret; +} + static __always_inline void trap_myself(void) { __asm("int3"); From 76ed9158e1d474e963fc59da7a461b27a2212c5a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:28 +0200 Subject: [PATCH 27/41] um: Rework syscall handling Rework syscall handling to be platform independent. Also create a clean split between queueing of syscalls and flushing them out, removing the need to keep state in the code that triggers the syscalls. The code adds syscall_data_len to the global mm_id structure. This will be used later to allow surrounding code to track whether syscalls still need to run and if errors occurred. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-5-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 22 ++- arch/um/include/shared/skas/mm_id.h | 1 + arch/um/include/shared/skas/stub-data.h | 35 +++- arch/um/include/shared/user.h | 8 + arch/um/kernel/exec.c | 10 +- arch/um/kernel/skas/Makefile | 7 +- arch/um/kernel/skas/clone.c | 2 +- arch/um/kernel/skas/stub.c | 80 ++++++++++ arch/um/kernel/tlb.c | 42 +++-- arch/um/os-Linux/skas/mem.c | 204 +++++++++++------------- arch/um/os-Linux/skas/process.c | 4 +- arch/x86/um/Makefile | 2 +- arch/x86/um/ldt.c | 41 ++--- arch/x86/um/shared/sysdep/stub.h | 1 + arch/x86/um/stub_32.S | 56 ------- arch/x86/um/stub_64.S | 50 ------ 16 files changed, 271 insertions(+), 294 deletions(-) create mode 100644 arch/um/kernel/skas/stub.c delete mode 100644 arch/x86/um/stub_32.S delete mode 100644 arch/x86/um/stub_64.S diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d269637adcd6..d1331d20fd2b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -272,19 +272,15 @@ extern long long os_persistent_clock_emulation(void); extern long long os_nsecs(void); /* skas/mem.c */ -extern long run_syscall_stub(struct mm_id * mm_idp, - int syscall, unsigned long *args, long expected, - void **addr, int done); -extern long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr); -extern int map(struct mm_id * mm_idp, unsigned long virt, - unsigned long len, int prot, int phys_fd, - unsigned long long offset, int done, void **data); -extern int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data); -extern int protect(struct mm_id * mm_idp, unsigned long addr, - unsigned long len, unsigned int prot, int done, void **data); +int syscall_stub_flush(struct mm_id *mm_idp); +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp); + +void map(struct mm_id *mm_idp, unsigned long virt, + unsigned long len, int prot, int phys_fd, + unsigned long long offset); +void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); +void protect(struct mm_id *mm_idp, unsigned long addr, + unsigned long len, unsigned int prot); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 92dbf727e384..4c5311abe42c 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -13,6 +13,7 @@ struct mm_id { } u; unsigned long stack; int kill; + int syscall_data_len; }; void __switch_mm(struct mm_id *mm_idp); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 779d2a3bac5d..6ffa9fb5218c 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -10,14 +10,45 @@ #include #include +#include + +#define STUB_NEXT_SYSCALL(s) \ + ((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len)) + +enum stub_syscall_type { + STUB_SYSCALL_UNSET = 0, + STUB_SYSCALL_MMAP, + STUB_SYSCALL_MUNMAP, + STUB_SYSCALL_MPROTECT, + STUB_SYSCALL_LDT, +}; + +struct stub_syscall { + union { + struct { + unsigned long addr; + unsigned long length; + unsigned long offset; + int fd; + int prot; + } mem; + struct { + user_desc_t desc; + int func; + } ldt; + }; + + enum stub_syscall_type syscall; +}; struct stub_data { unsigned long offset; int fd; - long parent_err, child_err; + long err, child_err; + int syscall_data_len; /* 128 leaves enough room for additional fields in the struct */ - unsigned char syscall_data[UM_KERN_PAGE_SIZE - 128] __aligned(16); + struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16); /* Stack for our signal handlers and for calling into . */ unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index 326e52450e41..bbab79c0c074 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -42,11 +42,19 @@ extern void panic(const char *fmt, ...) #define printk(...) _printk(__VA_ARGS__) extern int _printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii); #else static inline int printk(const char *fmt, ...) { return 0; } +static inline void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii) +{ +} #endif extern int in_aton(char *str); diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 827a0d3fa589..5c8836b012e9 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -22,15 +22,11 @@ void flush_thread(void) { - void *data = NULL; - int ret; - arch_flush_thread(¤t->thread.arch); - ret = unmap(¤t->mm->context.id, 0, TASK_SIZE, 1, &data); - if (ret) { - printk(KERN_ERR "%s - clearing address space failed, err = %d\n", - __func__, ret); + unmap(¤t->mm->context.id, 0, TASK_SIZE); + if (syscall_stub_flush(¤t->mm->context.id) < 0) { + printk(KERN_ERR "%s - clearing address space failed", __func__); force_sig(SIGKILL); } get_safe_registers(current_pt_regs()->regs.gp, diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f93972a25765..dd8bc2167e36 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,14 +3,15 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := clone.o mmu.o process.o syscall.o uaccess.o +obj-y := clone.o stub.o mmu.o process.o syscall.o uaccess.o -# clone.o is in the stub, so it can't be built with profiling +# clone.o and stub.o are in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o +CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) +UNPROFILE_OBJS := clone.o stub.o KCOV_INSTRUMENT := n diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c index 906f7454887c..b59fa43d68ce 100644 --- a/arch/um/kernel/skas/clone.c +++ b/arch/um/kernel/skas/clone.c @@ -33,7 +33,7 @@ stub_clone_handler(void) sizeof(data->syscall_data) / 2 - sizeof(void *)); if (err) { - data->parent_err = err; + data->err = err; goto done; } diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c new file mode 100644 index 000000000000..8773529b5048 --- /dev/null +++ b/arch/um/kernel/skas/stub.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Benjamin Berg + */ + +#include + +static __always_inline int syscall_handler(struct stub_data *d) +{ + int i; + unsigned long res; + + for (i = 0; i < d->syscall_data_len; i++) { + struct stub_syscall *sc = &d->syscall_data[i]; + + switch (sc->syscall) { + case STUB_SYSCALL_MMAP: + res = stub_syscall6(STUB_MMAP_NR, + sc->mem.addr, sc->mem.length, + sc->mem.prot, + MAP_SHARED | MAP_FIXED, + sc->mem.fd, sc->mem.offset); + if (res != sc->mem.addr) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_MUNMAP: + res = stub_syscall2(__NR_munmap, + sc->mem.addr, sc->mem.length); + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_MPROTECT: + res = stub_syscall3(__NR_mprotect, + sc->mem.addr, sc->mem.length, + sc->mem.prot); + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_LDT: + res = stub_syscall3(__NR_modify_ldt, sc->ldt.func, + (unsigned long) &sc->ldt.desc, + sizeof(sc->ldt.desc)); + /* We only write, so the expected result is zero */ + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + default: + d->err = -95; /* EOPNOTSUPP */ + d->syscall_data_len = i; + return -1; + } + } + + d->err = 0; + d->syscall_data_len = 0; + + return 0; +} + +void __section(".__syscall_stub") +stub_syscall_handler(void) +{ + struct stub_data *d = get_stub_data(); + + syscall_handler(d); + + trap_myself(); +} diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 8784f03fa4a6..a89e2886485f 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -71,21 +71,19 @@ static int do_ops(struct host_vm_change *hvc, int end, switch (op->type) { case MMAP: if (hvc->userspace) - ret = map(&hvc->mm->context.id, op->u.mmap.addr, - op->u.mmap.len, op->u.mmap.prot, - op->u.mmap.fd, - op->u.mmap.offset, finished, - &hvc->data); + map(&hvc->mm->context.id, op->u.mmap.addr, + op->u.mmap.len, op->u.mmap.prot, + op->u.mmap.fd, + op->u.mmap.offset); else map_memory(op->u.mmap.addr, op->u.mmap.offset, op->u.mmap.len, 1, 1, 1); break; case MUNMAP: if (hvc->userspace) - ret = unmap(&hvc->mm->context.id, - op->u.munmap.addr, - op->u.munmap.len, finished, - &hvc->data); + unmap(&hvc->mm->context.id, + op->u.munmap.addr, + op->u.munmap.len); else ret = os_unmap_memory( (void *) op->u.munmap.addr, @@ -94,11 +92,10 @@ static int do_ops(struct host_vm_change *hvc, int end, break; case MPROTECT: if (hvc->userspace) - ret = protect(&hvc->mm->context.id, - op->u.mprotect.addr, - op->u.mprotect.len, - op->u.mprotect.prot, - finished, &hvc->data); + protect(&hvc->mm->context.id, + op->u.mprotect.addr, + op->u.mprotect.len, + op->u.mprotect.prot); else ret = os_protect_memory( (void *) op->u.mprotect.addr, @@ -113,6 +110,9 @@ static int do_ops(struct host_vm_change *hvc, int end, } } + if (hvc->userspace && finished) + ret = syscall_stub_flush(&hvc->mm->context.id); + if (ret == -ENOMEM) report_enomem(); @@ -461,7 +461,6 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) pmd_t *pmd; pte_t *pte; struct mm_struct *mm = vma->vm_mm; - void *flush = NULL; int r, w, x, prot, err = 0; struct mm_id *mm_id; @@ -504,14 +503,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) int fd; fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, - 1, &flush); - } - else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); - } - else if (pte_newprot(*pte)) - err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); + map(mm_id, address, PAGE_SIZE, prot, fd, offset); + } else + unmap(mm_id, address, PAGE_SIZE); + } else if (pte_newprot(*pte)) + protect(mm_id, address, PAGE_SIZE, prot); + err = syscall_stub_flush(mm_id); if (err) { if (err == -ENOMEM) report_enomem(); diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 1f9c1bffc3a6..40be9085f65b 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -19,7 +20,7 @@ #include #include "../internal.h" -extern char batch_syscall_stub[], __syscall_stub_start[]; +extern char __syscall_stub_start[]; static inline unsigned long *check_init_stack(struct mm_id * mm_idp, unsigned long *stack) @@ -36,22 +37,24 @@ static unsigned long syscall_regs[MAX_REG_NR]; static int __init init_syscall_regs(void) { get_safe_registers(syscall_regs, NULL); + syscall_regs[REGS_IP_INDEX] = STUB_CODE + - ((unsigned long) batch_syscall_stub - + ((unsigned long) stub_syscall_handler - (unsigned long) __syscall_stub_start); - syscall_regs[REGS_SP_INDEX] = STUB_DATA; + syscall_regs[REGS_SP_INDEX] = STUB_DATA + + offsetof(struct stub_data, sigstack) + + sizeof(((struct stub_data *) 0)->sigstack) - + sizeof(void *); return 0; } __initcall(init_syscall_regs); -static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) +static inline long do_syscall_stub(struct mm_id *mm_idp) { + struct stub_data *proc_data = (void *)mm_idp->stack; int n, i; - long ret, offset; - unsigned long * data; - unsigned long * syscall; int err, pid = mm_idp->u.pid; n = ptrace_setregs(pid, syscall_regs); @@ -63,6 +66,9 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) __func__, -n); } + /* Inform process how much we have filled in. */ + proc_data->syscall_data_len = mm_idp->syscall_data_len; + err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) panic("Failed to continue stub, pid = %d, errno = %d\n", pid, @@ -71,135 +77,113 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) wait_stub_done(pid); /* - * When the stub stops, we find the following values on the - * beginning of the stack: - * (long )return_value - * (long )offset to failed sycall-data (0, if no error) + * proc_data->err will be non-zero if there was an (unexpected) error. + * In that case, syscall_data_len points to the last executed syscall, + * otherwise it will be zero (but we do not need to rely on that). */ - ret = *((unsigned long *) mm_idp->stack); - offset = *((unsigned long *) mm_idp->stack + 1); - if (offset) { - data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA); - printk(UM_KERN_ERR "%s : ret = %ld, offset = %ld, data = %p\n", - __func__, ret, offset, data); - syscall = (unsigned long *)((unsigned long)data + data[0]); - printk(UM_KERN_ERR "%s: syscall %ld failed, return value = 0x%lx, expected return value = 0x%lx\n", - __func__, syscall[0], ret, syscall[7]); - printk(UM_KERN_ERR " syscall parameters: 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - syscall[1], syscall[2], syscall[3], - syscall[4], syscall[5], syscall[6]); - for (n = 1; n < data[0]/sizeof(long); n++) { - if (n == 1) - printk(UM_KERN_ERR " additional syscall data:"); - if (n % 4 == 1) - printk("\n" UM_KERN_ERR " "); - printk(" 0x%lx", data[n]); - } - if (n > 1) - printk("\n"); + if (proc_data->err < 0) { + struct stub_syscall *sc; + + if (proc_data->syscall_data_len < 0 || + proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) + panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", + proc_data->syscall_data_len, + mm_idp->syscall_data_len); + + sc = &proc_data->syscall_data[proc_data->syscall_data_len]; + + printk(UM_KERN_ERR "%s : length = %d, last offset = %d", + __func__, mm_idp->syscall_data_len, + proc_data->syscall_data_len); + printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", + __func__, sc->syscall, proc_data->err); + + print_hex_dump(UM_KERN_ERR, + " syscall data: ", 0, + 16, 4, sc, sizeof(*sc), 0); + + mm_idp->syscall_data_len = proc_data->err; + } else { + mm_idp->syscall_data_len = 0; } - else ret = 0; - *addr = check_init_stack(mm_idp, NULL); - - return ret; + return mm_idp->syscall_data_len; } -long run_syscall_stub(struct mm_id * mm_idp, int syscall, - unsigned long *args, long expected, void **addr, - int done) +int syscall_stub_flush(struct mm_id *mm_idp) { - unsigned long *stack = check_init_stack(mm_idp, *addr); + int res; - *stack += sizeof(long); - stack += *stack / sizeof(long); - - *stack++ = syscall; - *stack++ = args[0]; - *stack++ = args[1]; - *stack++ = args[2]; - *stack++ = args[3]; - *stack++ = args[4]; - *stack++ = args[5]; - *stack++ = expected; - *stack = 0; - - if (!done && ((((unsigned long) stack) & ~UM_KERN_PAGE_MASK) < - UM_KERN_PAGE_SIZE - 10 * sizeof(long))) { - *addr = stack; + if (mm_idp->syscall_data_len == 0) return 0; + + /* If an error happened already, report it and reset the state. */ + if (mm_idp->syscall_data_len < 0) { + res = mm_idp->syscall_data_len; + mm_idp->syscall_data_len = 0; + return res; } - return do_syscall_stub(mm_idp, addr); + res = do_syscall_stub(mm_idp); + mm_idp->syscall_data_len = 0; + + return res; } -long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr) +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp) { - unsigned long *stack; - int ret = 0; + struct stub_syscall *sc; + struct stub_data *proc_data = (struct stub_data *) mm_idp->stack; - /* - * If *addr still is uninitialized, it *must* contain NULL. - * Thus in this case do_syscall_stub correctly won't be called. - */ - if ((((unsigned long) *addr) & ~UM_KERN_PAGE_MASK) >= - UM_KERN_PAGE_SIZE - (10 + data_count) * sizeof(long)) { - ret = do_syscall_stub(mm_idp, addr); - /* in case of error, don't overwrite data on stack */ - if (ret) - return ret; + if (mm_idp->syscall_data_len > 0 && + mm_idp->syscall_data_len == ARRAY_SIZE(proc_data->syscall_data)) + do_syscall_stub(mm_idp); + + if (mm_idp->syscall_data_len < 0) { + /* Return dummy to retain error state. */ + sc = &proc_data->syscall_data[0]; + } else { + sc = &proc_data->syscall_data[mm_idp->syscall_data_len]; + mm_idp->syscall_data_len += 1; } + memset(sc, 0, sizeof(*sc)); - stack = check_init_stack(mm_idp, *addr); - *addr = stack; - - *stack = data_count * sizeof(long); - - memcpy(stack + 1, data, data_count * sizeof(long)); - - *stub_addr = (void *)(((unsigned long)(stack + 1) & - ~UM_KERN_PAGE_MASK) + STUB_DATA); - - return 0; + return sc; } -int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot, - int phys_fd, unsigned long long offset, int done, void **data) + +void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - int ret; - unsigned long args[] = { virt, len, prot, - MAP_SHARED | MAP_FIXED, phys_fd, - MMAP_OFFSET(offset) }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt, - data, done); - - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MMAP; + sc->mem.addr = virt; + sc->mem.length = len; + sc->mem.prot = prot; + sc->mem.fd = phys_fd; + sc->mem.offset = MMAP_OFFSET(offset); } -int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data) +void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { - int ret; - unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0, - 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0, - data, done); - - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MUNMAP; + sc->mem.addr = addr; + sc->mem.length = len; } -int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - unsigned int prot, int done, void **data) +void protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, + unsigned int prot) { - int ret; - unsigned long args[] = { addr, len, prot, 0, 0, 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0, - data, done); - - return ret; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MPROTECT; + sc->mem.addr = addr; + sc->mem.length = len; + sc->mem.prot = prot; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 56403a1b006a..f49683ba186f 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -501,7 +501,7 @@ int copy_context_skas0(unsigned long new_stack, int pid) *data = ((struct stub_data) { .offset = MMAP_OFFSET(new_offset), .fd = new_fd, - .parent_err = -ESRCH, + .err = -ESRCH, .child_err = 0, }); @@ -538,7 +538,7 @@ int copy_context_skas0(unsigned long new_stack, int pid) wait_stub_done(pid); - pid = data->parent_err; + pid = data->err; if (pid < 0) { printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", __func__, -pid); diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 8bc72a51b257..4559d2a693c4 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -11,7 +11,7 @@ endif obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ - stub_$(BITS).o stub_segv.o \ + stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ mem_$(BITS).o subarch.o os-Linux/ diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 255a44dd415a..c99fc23b1290 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -12,33 +12,22 @@ #include #include #include +#include static inline int modify_ldt (int func, void *ptr, unsigned long bytecount) { return syscall(__NR_modify_ldt, func, ptr, bytecount); } -static long write_ldt_entry(struct mm_id *mm_idp, int func, - struct user_desc *desc, void **addr, int done) +static void write_ldt_entry(struct mm_id *mm_idp, int func, + struct user_desc *desc) { - long res; - void *stub_addr; + struct stub_syscall *sc; - BUILD_BUG_ON(sizeof(*desc) % sizeof(long)); - - res = syscall_stub_data(mm_idp, (unsigned long *)desc, - sizeof(*desc) / sizeof(long), - addr, &stub_addr); - if (!res) { - unsigned long args[] = { func, - (unsigned long)stub_addr, - sizeof(*desc), - 0, 0, 0 }; - res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, - 0, addr, done); - } - - return res; + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_LDT; + sc->ldt.func = func; + memcpy(&sc->ldt.desc, desc, sizeof(*desc)); } /* @@ -127,7 +116,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) int i, err; struct user_desc ldt_info; struct ldt_entry entry0, *ldt_p; - void *addr = NULL; err = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -148,7 +136,8 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) mutex_lock(&ldt->lock); - err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); + write_ldt_entry(mm_idp, func, &ldt_info); + err = syscall_stub_flush(mm_idp); if (err) goto out_unlock; @@ -166,7 +155,8 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func) err = -ENOMEM; /* Undo the change in host */ memset(&ldt_info, 0, sizeof(ldt_info)); - write_ldt_entry(mm_idp, 1, &ldt_info, &addr, 1); + write_ldt_entry(mm_idp, 1, &ldt_info); + err = syscall_stub_flush(mm_idp); goto out_unlock; } if (i == 0) { @@ -303,7 +293,6 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) short * num_p; int i; long page, err=0; - void *addr = NULL; mutex_init(&new_mm->arch.ldt.lock); @@ -318,11 +307,9 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) ldt_get_host_info(); for (num_p=host_ldt_entries; *num_p != -1; num_p++) { desc.entry_number = *num_p; - err = write_ldt_entry(&new_mm->id, 1, &desc, - &addr, *(num_p + 1) == -1); - if (err) - break; + write_ldt_entry(&new_mm->id, 1, &desc); } + err = syscall_stub_flush(&new_mm->id); new_mm->arch.ldt.entry_count = 0; goto out; diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index ce0ca46ad383..579681d12158 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -12,4 +12,5 @@ #endif extern void stub_segv_handler(int, siginfo_t *, void *); +extern void stub_syscall_handler(void); extern void stub_clone_handler(void); diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S deleted file mode 100644 index 8291899e6aaf..000000000000 --- a/arch/x86/um/stub_32.S +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include - -.section .__syscall_stub, "ax" - - .globl batch_syscall_stub -batch_syscall_stub: - /* %esp comes in as "top of page" */ - mov %esp, %ecx - /* %esp has pointer to first operation */ - add $8, %esp -again: - /* load length of additional data */ - mov 0x0(%esp), %eax - - /* if(length == 0) : end of list */ - /* write possible 0 to header */ - mov %eax, 0x4(%ecx) - cmpl $0, %eax - jz done - - /* save current pointer */ - mov %esp, 0x4(%ecx) - - /* skip additional data */ - add %eax, %esp - - /* load syscall-# */ - pop %eax - - /* load syscall params */ - pop %ebx - pop %ecx - pop %edx - pop %esi - pop %edi - pop %ebp - - /* execute syscall */ - int $0x80 - - /* restore top of page pointer in %ecx */ - mov %esp, %ecx - andl $(~UM_KERN_PAGE_SIZE) + 1, %ecx - - /* check return value */ - pop %ebx - cmp %ebx, %eax - je again - -done: - /* save return value */ - mov %eax, (%ecx) - - /* stop */ - int3 diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S deleted file mode 100644 index f3404640197a..000000000000 --- a/arch/x86/um/stub_64.S +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include - -.section .__syscall_stub, "ax" - .globl batch_syscall_stub -batch_syscall_stub: - /* %rsp has the pointer to first operation */ - mov %rsp, %rbx - add $0x10, %rsp -again: - /* load length of additional data */ - mov 0x0(%rsp), %rax - - /* if(length == 0) : end of list */ - /* write possible 0 to header */ - mov %rax, 8(%rbx) - cmp $0, %rax - jz done - - /* save current pointer */ - mov %rsp, 8(%rbx) - - /* skip additional data */ - add %rax, %rsp - - /* load syscall-# */ - pop %rax - - /* load syscall params */ - pop %rdi - pop %rsi - pop %rdx - pop %r10 - pop %r8 - pop %r9 - - /* execute syscall */ - syscall - - /* check return value */ - pop %rcx - cmp %rcx, %rax - je again - -done: - /* save return value */ - mov %rax, (%rbx) - - /* stop */ - int3 From 6d8992e49e2aed3ee2d73b88050f40f26ae6bf86 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:29 +0200 Subject: [PATCH 28/41] um: compress memory related stub syscalls while adding them To keep the number of syscalls that the stub has to do lower, compress two consecutive syscalls of the same type if the second is just a continuation of the first. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-6-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/os-Linux/skas/mem.c | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 40be9085f65b..32c61189110c 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -151,12 +151,37 @@ struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp) return sc; } +static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, + int syscall_type, + unsigned long virt) +{ + if (mm_idp->syscall_data_len > 0) { + struct stub_data *proc_data = (void *) mm_idp->stack; + struct stub_syscall *sc; + + sc = &proc_data->syscall_data[mm_idp->syscall_data_len - 1]; + + if (sc->syscall == syscall_type && + sc->mem.addr + sc->mem.length == virt) + return sc; + } + + return NULL; +} void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, unsigned long long offset) { struct stub_syscall *sc; + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); + if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd && + sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { + sc->mem.length += len; + return; + } + sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MMAP; sc->mem.addr = virt; @@ -170,6 +195,13 @@ void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { struct stub_syscall *sc; + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr); + if (sc) { + sc->mem.length += len; + return; + } + sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MUNMAP; sc->mem.addr = addr; @@ -181,6 +213,13 @@ void protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, { struct stub_syscall *sc; + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MPROTECT, addr); + if (sc && sc->mem.prot == prot) { + sc->mem.length += len; + return; + } + sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MPROTECT; sc->mem.addr = addr; From 7911b650a0708a3ee3412d80838b9574b21a53c8 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:30 +0200 Subject: [PATCH 29/41] um: remove LDT support The current LDT code has a few issues that mean it should be redone in a different way once we always start with a fresh MM even when cloning. In a new and better world, the kernel would just ensure its own LDT is clear at startup. At that point, all that is needed is a simple function to populate the LDT from another MM in arch_dup_mmap combined with some tracking of the installed LDT entries for each MM. Note that the old implementation was even incorrect with regard to reading, as it copied out the LDT entries in the internal format rather than converting them to the userspace structure. Removal should be fine as the LDT is not used for thread-local storage anymore. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-7-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/asm/mmu.h | 6 - arch/um/include/shared/skas/stub-data.h | 21 +- arch/um/kernel/skas/mmu.c | 8 - arch/um/kernel/skas/stub.c | 11 - arch/um/os-Linux/start_up.c | 1 + arch/x86/um/Makefile | 2 +- arch/x86/um/asm/mm_context.h | 70 ----- arch/x86/um/ldt.c | 367 ------------------------ arch/x86/um/tls_32.c | 1 + 9 files changed, 10 insertions(+), 477 deletions(-) delete mode 100644 arch/x86/um/asm/mm_context.h delete mode 100644 arch/x86/um/ldt.c diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index f2923c767bb9..6e433098d8c3 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -7,15 +7,9 @@ #define __ARCH_UM_MMU_H #include -#include typedef struct mm_context { struct mm_id id; - struct uml_arch_mm_context arch; } mm_context_t; -/* Avoid tangled inclusion with asm/ldt.h */ -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context *from_mm); -extern void free_ldt(struct mm_context *mm); - #endif diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 6ffa9fb5218c..6b8caf6b8283 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -20,23 +20,16 @@ enum stub_syscall_type { STUB_SYSCALL_MMAP, STUB_SYSCALL_MUNMAP, STUB_SYSCALL_MPROTECT, - STUB_SYSCALL_LDT, }; struct stub_syscall { - union { - struct { - unsigned long addr; - unsigned long length; - unsigned long offset; - int fd; - int prot; - } mem; - struct { - user_desc_t desc; - int func; - } ldt; - }; + struct { + unsigned long addr; + unsigned long length; + unsigned long offset; + int fd; + int prot; + } mem; enum stub_syscall_type syscall; }; diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 681839cdd795..968e254cf709 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -46,13 +46,6 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) goto out_free; } - ret = init_new_ldt(to_mm, from_mm); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - init_ldt" - " failed, errno = %d\n", ret); - goto out_free; - } - return 0; out_free: @@ -80,5 +73,4 @@ void destroy_context(struct mm_struct *mm) os_kill_ptraced_process(mmu->id.u.pid, 1); free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); - free_ldt(mmu); } diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 8773529b5048..5d52ffa682dc 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -45,17 +45,6 @@ static __always_inline int syscall_handler(struct stub_data *d) return -1; } break; - case STUB_SYSCALL_LDT: - res = stub_syscall3(__NR_modify_ldt, sc->ldt.func, - (unsigned long) &sc->ldt.desc, - sizeof(sc->ldt.desc)); - /* We only write, so the expected result is zero */ - if (res) { - d->err = res; - d->syscall_data_len = i; - return -1; - } - break; default: d->err = -95; /* EOPNOTSUPP */ d->syscall_data_len = i; diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 89ad9f4f865c..93fc82c01aba 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 4559d2a693c4..17b85209c43d 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -9,7 +9,7 @@ else BITS := 64 endif -obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \ +obj-y = bugs_$(BITS).o delay.o fault.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h deleted file mode 100644 index dc32dc023c2f..000000000000 --- a/arch/x86/um/asm/mm_context.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2004 Fujitsu Siemens Computers GmbH - * Licensed under the GPL - * - * Author: Bodo Stroesser - */ - -#ifndef __ASM_LDT_H -#define __ASM_LDT_H - -#include -#include - -#define LDT_PAGES_MAX \ - ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE) -#define LDT_ENTRIES_PER_PAGE \ - (PAGE_SIZE/LDT_ENTRY_SIZE) -#define LDT_DIRECT_ENTRIES \ - ((LDT_PAGES_MAX*sizeof(void *))/LDT_ENTRY_SIZE) - -struct ldt_entry { - __u32 a; - __u32 b; -}; - -typedef struct uml_ldt { - int entry_count; - struct mutex lock; - union { - struct ldt_entry * pages[LDT_PAGES_MAX]; - struct ldt_entry entries[LDT_DIRECT_ENTRIES]; - } u; -} uml_ldt_t; - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - 0x7000) - -#define _LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 ) - -#ifdef CONFIG_X86_64 -#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) -#else -#define LDT_empty(info) (_LDT_empty(info)) -#endif - -struct uml_arch_mm_context { - uml_ldt_t ldt; -}; - -#endif diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c deleted file mode 100644 index c99fc23b1290..000000000000 --- a/arch/x86/um/ldt.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static inline int modify_ldt (int func, void *ptr, unsigned long bytecount) -{ - return syscall(__NR_modify_ldt, func, ptr, bytecount); -} - -static void write_ldt_entry(struct mm_id *mm_idp, int func, - struct user_desc *desc) -{ - struct stub_syscall *sc; - - sc = syscall_stub_alloc(mm_idp); - sc->syscall = STUB_SYSCALL_LDT; - sc->ldt.func = func; - memcpy(&sc->ldt.desc, desc, sizeof(*desc)); -} - -/* - * In skas mode, we hold our own ldt data in UML. - * Thus, the code implementing sys_modify_ldt_skas - * is very similar to (and mostly stolen from) sys_modify_ldt - * for arch/i386/kernel/ldt.c - * The routines copied and modified in part are: - * - read_ldt - * - read_default_ldt - * - write_ldt - * - sys_modify_ldt_skas - */ - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int i, err = 0; - unsigned long size; - uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; - - if (!ldt->entry_count) - goto out; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - err = bytecount; - - mutex_lock(&ldt->lock); - if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { - size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; - if (size > bytecount) - size = bytecount; - if (copy_to_user(ptr, ldt->u.entries, size)) - err = -EFAULT; - bytecount -= size; - ptr += size; - } - else { - for (i=0; ientry_count/LDT_ENTRIES_PER_PAGE && bytecount; - i++) { - size = PAGE_SIZE; - if (size > bytecount) - size = bytecount; - if (copy_to_user(ptr, ldt->u.pages[i], size)) { - err = -EFAULT; - break; - } - bytecount -= size; - ptr += size; - } - } - mutex_unlock(&ldt->lock); - - if (bytecount == 0 || err == -EFAULT) - goto out; - - if (clear_user(ptr, bytecount)) - err = -EFAULT; - -out: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - - if (bytecount > 5*LDT_ENTRY_SIZE) - bytecount = 5*LDT_ENTRY_SIZE; - - err = bytecount; - /* - * UML doesn't support lcall7 and lcall27. - * So, we don't really have a default ldt, but emulate - * an empty ldt of common host default ldt size. - */ - if (clear_user(ptr, bytecount)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int func) -{ - uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; - struct mm_id * mm_idp = ¤t->mm->context.id; - int i, err; - struct user_desc ldt_info; - struct ldt_entry entry0, *ldt_p; - - err = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - err = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - err = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (func == 1) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&ldt->lock); - - write_ldt_entry(mm_idp, func, &ldt_info); - err = syscall_stub_flush(mm_idp); - if (err) - goto out_unlock; - - if (ldt_info.entry_number >= ldt->entry_count && - ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { - for (i=ldt->entry_count/LDT_ENTRIES_PER_PAGE; - i*LDT_ENTRIES_PER_PAGE <= ldt_info.entry_number; - i++) { - if (i == 0) - memcpy(&entry0, ldt->u.entries, - sizeof(entry0)); - ldt->u.pages[i] = (struct ldt_entry *) - __get_free_page(GFP_KERNEL|__GFP_ZERO); - if (!ldt->u.pages[i]) { - err = -ENOMEM; - /* Undo the change in host */ - memset(&ldt_info, 0, sizeof(ldt_info)); - write_ldt_entry(mm_idp, 1, &ldt_info); - err = syscall_stub_flush(mm_idp); - goto out_unlock; - } - if (i == 0) { - memcpy(ldt->u.pages[0], &entry0, - sizeof(entry0)); - memcpy(ldt->u.pages[0]+1, ldt->u.entries+1, - sizeof(entry0)*(LDT_DIRECT_ENTRIES-1)); - } - ldt->entry_count = (i + 1) * LDT_ENTRIES_PER_PAGE; - } - } - if (ldt->entry_count <= ldt_info.entry_number) - ldt->entry_count = ldt_info.entry_number + 1; - - if (ldt->entry_count <= LDT_DIRECT_ENTRIES) - ldt_p = ldt->u.entries + ldt_info.entry_number; - else - ldt_p = ldt->u.pages[ldt_info.entry_number/LDT_ENTRIES_PER_PAGE] + - ldt_info.entry_number%LDT_ENTRIES_PER_PAGE; - - if (ldt_info.base_addr == 0 && ldt_info.limit == 0 && - (func == 1 || LDT_empty(&ldt_info))) { - ldt_p->a = 0; - ldt_p->b = 0; - } - else{ - if (func == 1) - ldt_info.useable = 0; - ldt_p->a = LDT_entry_a(&ldt_info); - ldt_p->b = LDT_entry_b(&ldt_info); - } - err = 0; - -out_unlock: - mutex_unlock(&ldt->lock); -out: - return err; -} - -static long do_modify_ldt_skas(int func, void __user *ptr, - unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - case 0x11: - ret = write_ldt(ptr, bytecount, func); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - } - return ret; -} - -static DEFINE_SPINLOCK(host_ldt_lock); -static short dummy_list[9] = {0, -1}; -static short * host_ldt_entries = NULL; - -static void ldt_get_host_info(void) -{ - long ret; - struct ldt_entry * ldt; - short *tmp; - int i, size, k, order; - - spin_lock(&host_ldt_lock); - - if (host_ldt_entries != NULL) { - spin_unlock(&host_ldt_lock); - return; - } - host_ldt_entries = dummy_list+1; - - spin_unlock(&host_ldt_lock); - - for (i = LDT_PAGES_MAX-1, order=0; i; i>>=1, order++) - ; - - ldt = (struct ldt_entry *) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (ldt == NULL) { - printk(KERN_ERR "ldt_get_host_info: couldn't allocate buffer " - "for host ldt\n"); - return; - } - - ret = modify_ldt(0, ldt, (1<arch.ldt.lock); - - if (!from_mm) { - memset(&desc, 0, sizeof(desc)); - /* - * Now we try to retrieve info about the ldt, we - * inherited from the host. All ldt-entries found - * will be reset in the following loop - */ - ldt_get_host_info(); - for (num_p=host_ldt_entries; *num_p != -1; num_p++) { - desc.entry_number = *num_p; - write_ldt_entry(&new_mm->id, 1, &desc); - } - err = syscall_stub_flush(&new_mm->id); - new_mm->arch.ldt.entry_count = 0; - - goto out; - } - - /* - * Our local LDT is used to supply the data for - * modify_ldt(READLDT), if PTRACE_LDT isn't available, - * i.e., we have to use the stub for modify_ldt, which - * can't handle the big read buffer of up to 64kB. - */ - mutex_lock(&from_mm->arch.ldt.lock); - if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) - memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, - sizeof(new_mm->arch.ldt.u.entries)); - else { - i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; - while (i-->0) { - page = __get_free_page(GFP_KERNEL|__GFP_ZERO); - if (!page) { - err = -ENOMEM; - break; - } - new_mm->arch.ldt.u.pages[i] = - (struct ldt_entry *) page; - memcpy(new_mm->arch.ldt.u.pages[i], - from_mm->arch.ldt.u.pages[i], PAGE_SIZE); - } - } - new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; - mutex_unlock(&from_mm->arch.ldt.lock); - - out: - return err; -} - - -void free_ldt(struct mm_context *mm) -{ - int i; - - if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { - i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; - while (i-- > 0) - free_page((long) mm->arch.ldt.u.pages[i]); - } - mm->arch.ldt.entry_count = 0; -} - -SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , - unsigned long , bytecount) -{ - /* See non-um modify_ldt() for why we do this cast */ - return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount); -} diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index d301deee041f..fbb129023080 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -11,6 +11,7 @@ #include #include #include +#include /* * If needed we can detect when it's uninitialized. From a5d2cfe749e2917266956751262328da35e0925d Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:31 +0200 Subject: [PATCH 30/41] um: remove copy_context_skas0 The kernel flushes the memory ranges anyway for CoW and does not assume that the userspace process has anything set up already. So, start with a fresh process for the new mm context. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-8-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/include/shared/skas/stub-data.h | 1 - arch/um/kernel/skas/Makefile | 8 +- arch/um/kernel/skas/clone.c | 50 ----------- arch/um/kernel/skas/mmu.c | 20 ++--- arch/um/os-Linux/skas/process.c | 108 ------------------------ arch/x86/um/shared/sysdep/stub.h | 1 - arch/x86/um/shared/sysdep/stub_32.h | 24 ------ arch/x86/um/shared/sysdep/stub_64.h | 26 ------ 9 files changed, 10 insertions(+), 229 deletions(-) delete mode 100644 arch/um/kernel/skas/clone.c diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d1331d20fd2b..d12fec5d0b4d 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -285,7 +285,6 @@ void protect(struct mm_id *mm_idp, unsigned long addr, /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); extern int start_userspace(unsigned long stub_stack); -extern int copy_context_skas0(unsigned long stack, int pid); extern void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 6b8caf6b8283..2b6b44759dfa 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -36,7 +36,6 @@ struct stub_syscall { struct stub_data { unsigned long offset; - int fd; long err, child_err; int syscall_data_len; diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index dd8bc2167e36..6f86d53e3d69 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,16 +3,14 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := clone.o stub.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o -# clone.o and stub.o are in the stub, so it can't be built with profiling +# stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it -CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o stub.o - +UNPROFILE_OBJS := stub.o KCOV_INSTRUMENT := n include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c deleted file mode 100644 index b59fa43d68ce..000000000000 --- a/arch/um/kernel/skas/clone.c +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * This is in a separate file because it needs to be compiled with any - * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled - * - * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize - * on some systems. - */ - -void __attribute__ ((__section__ (".__syscall_stub"))) -stub_clone_handler(void) -{ - struct stub_data *data = get_stub_data(); - long err; - - /* syscall data as a temporary stack area (bottom half). */ - err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - (unsigned long) data->syscall_data + - sizeof(data->syscall_data) / 2 - - sizeof(void *)); - if (err) { - data->err = err; - goto done; - } - - err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if (err) { - data->child_err = err; - goto done; - } - - remap_stack_and_trap(); - - done: - trap_myself(); -} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 968e254cf709..697dad49c36b 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -21,8 +21,7 @@ static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); int init_new_context(struct task_struct *task, struct mm_struct *mm) { - struct mm_context *from_mm = NULL; - struct mm_context *to_mm = &mm->context; + struct mm_id *new_id = &mm->context.id; unsigned long stack = 0; int ret = -ENOMEM; @@ -30,27 +29,22 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) if (stack == 0) goto out; - to_mm->id.stack = stack; - if (current->mm != NULL && current->mm != &init_mm) - from_mm = ¤t->mm->context; + new_id->stack = stack; block_signals_trace(); - if (from_mm) - to_mm->id.u.pid = copy_context_skas0(stack, - from_mm->id.u.pid); - else to_mm->id.u.pid = start_userspace(stack); + new_id->u.pid = start_userspace(stack); unblock_signals_trace(); - if (to_mm->id.u.pid < 0) { - ret = to_mm->id.u.pid; + if (new_id->u.pid < 0) { + ret = new_id->u.pid; goto out_free; } return 0; out_free: - if (to_mm->id.stack != 0) - free_pages(to_mm->id.stack, ilog2(STUB_DATA_PAGES)); + if (new_id->stack != 0) + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index f49683ba186f..566b8ecccc3f 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -464,114 +464,6 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) } } -static unsigned long thread_regs[MAX_REG_NR]; -static unsigned long thread_fp_regs[FP_SIZE]; - -static int __init init_thread_regs(void) -{ - get_safe_registers(thread_regs, thread_fp_regs); - /* Set parent's instruction pointer to start of clone-stub */ - thread_regs[REGS_IP_INDEX] = STUB_CODE + - (unsigned long) stub_clone_handler - - (unsigned long) __syscall_stub_start; - - /* syscall data as a temporary stack area (top half). */ - thread_regs[REGS_SP_INDEX] = STUB_DATA + - offsetof(struct stub_data, syscall_data) + - sizeof(((struct stub_data *) 0)->syscall_data) - - sizeof(void *); - return 0; -} - -__initcall(init_thread_regs); - -int copy_context_skas0(unsigned long new_stack, int pid) -{ - int err; - unsigned long current_stack = current_stub_stack(); - struct stub_data *data = (struct stub_data *) current_stack; - struct stub_data *child_data = (struct stub_data *) new_stack; - unsigned long long new_offset; - int new_fd = phys_mapping(uml_to_phys((void *)new_stack), &new_offset); - - /* - * prepare offset and fd of child's stack as argument for parent's - * and child's mmap2 calls - */ - *data = ((struct stub_data) { - .offset = MMAP_OFFSET(new_offset), - .fd = new_fd, - .err = -ESRCH, - .child_err = 0, - }); - - *child_data = ((struct stub_data) { - .child_err = -ESRCH, - }); - - err = ptrace_setregs(pid, thread_regs); - if (err < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n", - __func__, pid, -err); - return err; - } - - err = put_fp_registers(pid, thread_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n", - __func__, pid, err); - return err; - } - - /* - * Wait, until parent has finished its work: read child's pid from - * parent's stack, and check, if bad result. - */ - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) { - err = -errno; - printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n", - pid, errno); - return err; - } - - wait_stub_done(pid); - - pid = data->err; - if (pid < 0) { - printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", - __func__, -pid); - return pid; - } - - /* - * Wait, until child has finished too: read child's result from - * child's stack and check it. - */ - wait_stub_done(pid); - if (child_data->child_err != STUB_DATA) { - printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n", - __func__, pid, data->child_err); - err = data->child_err; - goto out_kill; - } - - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, - (void *)PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", - __func__, errno); - goto out_kill; - } - - return pid; - - out_kill: - os_kill_ptraced_process(pid, 1); - return err; -} - void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) { (*buf)[0].JB_IP = (unsigned long) handler; diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index 579681d12158..dc89f4423454 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -13,4 +13,3 @@ extern void stub_segv_handler(int, siginfo_t *, void *); extern void stub_syscall_handler(void); -extern void stub_clone_handler(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index ab08a69fb57f..0b44a86dd346 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -107,30 +107,6 @@ static __always_inline void trap_myself(void) __asm("int3"); } -static __always_inline void remap_stack_and_trap(void) -{ - __asm__ volatile ( - "movl %%esp,%%ebx ;" - "andl %0,%%ebx ;" - "movl %1,%%eax ;" - "movl %%ebx,%%edi ; addl %2,%%edi ; movl (%%edi),%%edi ;" - "movl %%ebx,%%ebp ; addl %3,%%ebp ; movl (%%ebp),%%ebp ;" - "int $0x80 ;" - "addl %4,%%ebx ; movl %%eax, (%%ebx) ;" - "int $3" - : : - "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), - "g" (STUB_MMAP_NR), - "g" (offsetof(struct stub_data, fd)), - "g" (offsetof(struct stub_data, offset)), - "g" (offsetof(struct stub_data, child_err)), - "c" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), - "d" (PROT_READ | PROT_WRITE), - "S" (MAP_FIXED | MAP_SHARED) - : - "memory"); -} - static __always_inline void *get_stub_data(void) { unsigned long ret; diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index d27b34d75d70..67f44284f1aa 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -101,32 +101,6 @@ static __always_inline void trap_myself(void) __asm("int3"); } -static __always_inline void remap_stack_and_trap(void) -{ - __asm__ volatile ( - "movq %0,%%rax ;" - "movq %%rsp,%%rdi ;" - "andq %1,%%rdi ;" - "movq %2,%%r10 ;" - "movq %%rdi,%%r8 ; addq %3,%%r8 ; movq (%%r8),%%r8 ;" - "movq %%rdi,%%r9 ; addq %4,%%r9 ; movq (%%r9),%%r9 ;" - __syscall ";" - "movq %%rsp,%%rdi ; andq %1,%%rdi ;" - "addq %5,%%rdi ; movq %%rax, (%%rdi) ;" - "int3" - : : - "g" (STUB_MMAP_NR), - "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)), - "g" (MAP_FIXED | MAP_SHARED), - "g" (offsetof(struct stub_data, fd)), - "g" (offsetof(struct stub_data, offset)), - "g" (offsetof(struct stub_data, child_err)), - "S" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE), - "d" (PROT_READ | PROT_WRITE) - : - __syscall_clobber, "r10", "r8", "r9"); -} - static __always_inline void *get_stub_data(void) { unsigned long ret; From 3c83170d7cdf3df12e430c429462776dcb52ff87 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:32 +0200 Subject: [PATCH 31/41] um: Delay flushing syscalls until the thread is restarted As running the syscalls is expensive due to context switches, we should do so as late as possible in case more syscalls need to be queued later on. This will also benefit a later move to a SECCOMP enabled userspace as in that case the need for extra context switches is removed entirely. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-9-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 4 +++ arch/um/include/shared/skas/mm_id.h | 1 - arch/um/include/shared/skas/skas.h | 1 + arch/um/kernel/skas/process.c | 8 ++++++ arch/um/kernel/tlb.c | 21 ++------------ arch/um/os-Linux/skas/mem.c | 44 ++++++++++++++++------------- arch/um/os-Linux/skas/process.c | 12 ++++++-- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d12fec5d0b4d..345b117aff0b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -195,6 +195,9 @@ extern void get_host_cpu_features( /* mem.c */ extern int create_mem_file(unsigned long long len); +/* tlb.c */ +extern void report_enomem(void); + /* process.c */ extern unsigned long os_process_pc(int pid); extern int os_process_parent(int pid); @@ -274,6 +277,7 @@ extern long long os_nsecs(void); /* skas/mem.c */ int syscall_stub_flush(struct mm_id *mm_idp); struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp); +void syscall_stub_dump_error(struct mm_id *mm_idp); void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 4c5311abe42c..1e76ba40feba 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -12,7 +12,6 @@ struct mm_id { int pid; } u; unsigned long stack; - int kill; int syscall_data_len; }; diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index c93d2cbc8f32..5c78b0cc3dd4 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -15,5 +15,6 @@ extern void new_thread_handler(void); extern void handle_syscall(struct uml_pt_regs *regs); extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); +extern struct mm_id *current_mm_id(void); #endif diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 99a5cbb36083..41901a74fccc 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -50,3 +50,11 @@ unsigned long current_stub_stack(void) return current->mm->context.id.stack; } + +struct mm_id *current_mm_id(void) +{ + if (current->mm == NULL) + return NULL; + + return ¤t->mm->context.id; +} diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index a89e2886485f..1eee7134cba8 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -53,7 +53,7 @@ struct host_vm_change { .index = 0, \ .force = force }) -static void report_enomem(void) +void report_enomem(void) { printk(KERN_ERR "UML ran out of memory on the host side! " "This can happen due to a memory limitation or " @@ -338,15 +338,6 @@ static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, if (!ret) ret = do_ops(&hvc, hvc.index, 1); - - /* This is not an else because ret is modified above */ - if (ret) { - struct mm_id *mm_idp = ¤t->mm->context.id; - - printk(KERN_ERR "fix_range_common: failed, killing current " - "process: %d\n", task_tgid_vnr(current)); - mm_idp->kill = 1; - } } static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) @@ -461,7 +452,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) pmd_t *pmd; pte_t *pte; struct mm_struct *mm = vma->vm_mm; - int r, w, x, prot, err = 0; + int r, w, x, prot; struct mm_id *mm_id; address &= PAGE_MASK; @@ -509,14 +500,6 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) } else if (pte_newprot(*pte)) protect(mm_id, address, PAGE_SIZE, prot); - err = syscall_stub_flush(mm_id); - if (err) { - if (err == -ENOMEM) - report_enomem(); - - goto kill; - } - *pte = pte_mkuptodate(*pte); return; diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 32c61189110c..46fa10ab9892 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -22,6 +22,29 @@ extern char __syscall_stub_start[]; +void syscall_stub_dump_error(struct mm_id *mm_idp) +{ + struct stub_data *proc_data = (void *)mm_idp->stack; + struct stub_syscall *sc; + + if (proc_data->syscall_data_len < 0 || + proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) + panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", + proc_data->syscall_data_len, + mm_idp->syscall_data_len); + + sc = &proc_data->syscall_data[proc_data->syscall_data_len]; + + printk(UM_KERN_ERR "%s : length = %d, last offset = %d", + __func__, mm_idp->syscall_data_len, + proc_data->syscall_data_len); + printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", + __func__, sc->syscall, proc_data->err); + + print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, + 16, 4, sc, sizeof(*sc), 0); +} + static inline unsigned long *check_init_stack(struct mm_id * mm_idp, unsigned long *stack) { @@ -82,26 +105,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) * otherwise it will be zero (but we do not need to rely on that). */ if (proc_data->err < 0) { - struct stub_syscall *sc; - - if (proc_data->syscall_data_len < 0 || - proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) - panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", - proc_data->syscall_data_len, - mm_idp->syscall_data_len); - - sc = &proc_data->syscall_data[proc_data->syscall_data_len]; - - printk(UM_KERN_ERR "%s : length = %d, last offset = %d", - __func__, mm_idp->syscall_data_len, - proc_data->syscall_data_len); - printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", - __func__, sc->syscall, proc_data->err); - - print_hex_dump(UM_KERN_ERR, - " syscall data: ", 0, - 16, 4, sc, sizeof(*sc), 0); + syscall_stub_dump_error(mm_idp); + /* Store error code in case someone tries to add more syscalls */ mm_idp->syscall_data_len = proc_data->err; } else { mm_idp->syscall_data_len = 0; diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 566b8ecccc3f..9cb61610147a 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -254,7 +254,6 @@ static int userspace_tramp(void *stack) } int userspace_pid[NR_CPUS]; -int kill_userspace_mm[NR_CPUS]; /** * start_userspace() - prepare a new userspace process @@ -348,8 +347,16 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) while (1) { time_travel_print_bc_msg(); - if (kill_userspace_mm[0]) + /* Flush out any pending syscalls */ + err = syscall_stub_flush(current_mm_id()); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); fatal_sigsegv(); + } /* * This can legitimately fail if the process loads a @@ -580,5 +587,4 @@ void reboot_skas(void) void __switch_mm(struct mm_id *mm_idp) { userspace_pid[0] = mm_idp->u.pid; - kill_userspace_mm[0] = mm_idp->kill; } From 5168f6b4a4d8fb5f731f2107924f72dffeae84fc Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:33 +0200 Subject: [PATCH 32/41] um: Do not flush MM in flush_thread There should be no need to flush the memory in flush_thread. Doing this likely worked around some issue where memory was still incorrectly mapped when creating or cloning an MM. With the removal of the special clone path, that isn't relevant anymore. However, add the flush into MM initialization so that any new userspace MM is guaranteed to be clean. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-10-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/exec.c | 5 ----- arch/um/kernel/skas/mmu.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 5c8836b012e9..2c15bb2c104c 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -24,11 +24,6 @@ void flush_thread(void) { arch_flush_thread(¤t->thread.arch); - unmap(¤t->mm->context.id, 0, TASK_SIZE); - if (syscall_stub_flush(¤t->mm->context.id) < 0) { - printk(KERN_ERR "%s - clearing address space failed", __func__); - force_sig(SIGKILL); - } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 697dad49c36b..47f98d87ea3c 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -40,6 +40,30 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) goto out_free; } + /* + * Ensure the new MM is clean and nothing unwanted is mapped. + * + * TODO: We should clear the memory up to STUB_START to ensure there is + * nothing mapped there, i.e. we (currently) have: + * + * |- user memory -|- unused -|- stub -|- unused -| + * ^ TASK_SIZE ^ STUB_START + * + * Meaning we have two unused areas where we may still have valid + * mappings from our internal clone(). That isn't really a problem as + * userspace is not going to access them, but it is definitely not + * correct. + * + * However, we are "lucky" and if rseq is configured, then on 32 bit + * it will fall into the first empty range while on 64 bit it is going + * to use an anonymous mapping in the second range. As such, things + * continue to work for now as long as we don't start unmapping these + * areas. + * + * Change this to STUB_START once we have a clean userspace. + */ + unmap(new_id, 0, TASK_SIZE); + return 0; out_free: From ef714f15027ca6f72e90d9a198c72e93b855e2a8 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:34 +0200 Subject: [PATCH 33/41] um: remove force_flush_all from fork_handler There should be no need for this. It may be that this used to work around another issue where after a clone the MM was in a bad state. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-11-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/asm/mmu_context.h | 2 -- arch/um/kernel/process.c | 2 -- arch/um/kernel/tlb.c | 44 +++++++++++-------------------- 3 files changed, 15 insertions(+), 33 deletions(-) diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 68e2eb9cfb47..23dcc914d44e 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -13,8 +13,6 @@ #include #include -extern void force_flush_all(void); - #define activate_mm activate_mm static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 79f41f06a98f..f36b63f53bab 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -122,8 +122,6 @@ void new_thread_handler(void) /* Called magically, see new_thread_handler above */ static void fork_handler(void) { - force_flush_all(); - schedule_tail(current->thread.prev_sched); /* diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 1eee7134cba8..4f482b881adc 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -41,17 +41,15 @@ struct host_vm_change { int index; struct mm_struct *mm; void *data; - int force; }; -#define INIT_HVC(mm, force, userspace) \ +#define INIT_HVC(mm, userspace) \ ((struct host_vm_change) \ { .ops = { { .type = NONE } }, \ .mm = mm, \ .data = NULL, \ .userspace = userspace, \ - .index = 0, \ - .force = force }) + .index = 0 }) void report_enomem(void) { @@ -235,7 +233,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | (x ? UM_PROT_EXEC : 0)); - if (hvc->force || pte_newpage(*pte)) { + if (pte_newpage(*pte)) { if (pte_present(*pte)) { if (pte_newpage(*pte)) ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, @@ -261,7 +259,7 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (hvc->force || pmd_newpage(*pmd)) { + if (pmd_newpage(*pmd)) { ret = add_munmap(addr, next - addr, hvc); pmd_mkuptodate(*pmd); } @@ -283,7 +281,7 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (hvc->force || pud_newpage(*pud)) { + if (pud_newpage(*pud)) { ret = add_munmap(addr, next - addr, hvc); pud_mkuptodate(*pud); } @@ -305,7 +303,7 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { - if (hvc->force || p4d_newpage(*p4d)) { + if (p4d_newpage(*p4d)) { ret = add_munmap(addr, next - addr, hvc); p4d_mkuptodate(*p4d); } @@ -316,19 +314,19 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, } static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) + unsigned long end_addr) { pgd_t *pgd; struct host_vm_change hvc; unsigned long addr = start_addr, next; int ret = 0, userspace = 1; - hvc = INIT_HVC(mm, force, userspace); + hvc = INIT_HVC(mm, userspace); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end_addr); if (!pgd_present(*pgd)) { - if (force || pgd_newpage(*pgd)) { + if (pgd_newpage(*pgd)) { ret = add_munmap(addr, next - addr, &hvc); pgd_mkuptodate(*pgd); } @@ -349,11 +347,11 @@ static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) pmd_t *pmd; pte_t *pte; unsigned long addr, last; - int updated = 0, err = 0, force = 0, userspace = 0; + int updated = 0, err = 0, userspace = 0; struct host_vm_change hvc; mm = &init_mm; - hvc = INIT_HVC(mm, force, userspace); + hvc = INIT_HVC(mm, userspace); for (addr = start; addr < end;) { pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) { @@ -537,7 +535,7 @@ void __flush_tlb_one(unsigned long addr) } static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) + unsigned long end_addr) { /* * Don't bother flushing if this address space is about to be @@ -546,7 +544,7 @@ static void fix_range(struct mm_struct *mm, unsigned long start_addr, if (atomic_read(&mm->mm_users) == 0) return; - fix_range_common(mm, start_addr, end_addr, force); + fix_range_common(mm, start_addr, end_addr); } void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -554,7 +552,7 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, { if (vma->vm_mm == NULL) flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end, 0); + else fix_range(vma->vm_mm, start, end); } EXPORT_SYMBOL(flush_tlb_range); @@ -564,17 +562,5 @@ void flush_tlb_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 0); -} - -void force_flush_all(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 1); - mmap_read_unlock(mm); + fix_range(mm, vma->vm_start, vma->vm_end); } From 573a446fc8ea4ca9be60b1db2091297da48d0a0d Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:35 +0200 Subject: [PATCH 34/41] um: simplify and consolidate TLB updates The HVC update was mostly used to compress consecutive calls into one. This is mostly relevant for userspace where it is already handled by the syscall stub code. Simplify the whole logic and consolidate it for both kernel and userspace. This does remove the sequential syscall compression for the kernel, however that shouldn't be the main factor in most runs. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-12-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 12 +- arch/um/kernel/tlb.c | 394 ++++++++---------------------------- arch/um/os-Linux/skas/mem.c | 18 +- 3 files changed, 103 insertions(+), 321 deletions(-) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 345b117aff0b..9a039d6f1f74 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -279,12 +279,12 @@ int syscall_stub_flush(struct mm_id *mm_idp); struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp); void syscall_stub_dump_error(struct mm_id *mm_idp); -void map(struct mm_id *mm_idp, unsigned long virt, - unsigned long len, int prot, int phys_fd, - unsigned long long offset); -void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); -void protect(struct mm_id *mm_idp, unsigned long addr, - unsigned long len, unsigned int prot); +int map(struct mm_id *mm_idp, unsigned long virt, + unsigned long len, int prot, int phys_fd, + unsigned long long offset); +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); +int protect(struct mm_id *mm_idp, unsigned long addr, + unsigned long len, unsigned int prot); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 4f482b881adc..2ba1865679ae 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -15,41 +15,43 @@ #include #include -struct host_vm_change { - struct host_vm_op { - enum { NONE, MMAP, MUNMAP, MPROTECT } type; - union { - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - int fd; - __u64 offset; - } mmap; - struct { - unsigned long addr; - unsigned long len; - } munmap; - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - } mprotect; - } u; - } ops[1]; - int userspace; - int index; - struct mm_struct *mm; - void *data; +struct vm_ops { + struct mm_id *mm_idp; + + int (*mmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset); + int (*unmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len); + int (*mprotect)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, + unsigned int prot); }; -#define INIT_HVC(mm, userspace) \ - ((struct host_vm_change) \ - { .ops = { { .type = NONE } }, \ - .mm = mm, \ - .data = NULL, \ - .userspace = userspace, \ - .index = 0 }) +static int kern_map(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) +{ + /* TODO: Why is executable needed to be always set in the kernel? */ + return os_map_memory((void *)virt, phys_fd, offset, len, + prot & UM_PROT_READ, prot & UM_PROT_WRITE, + 1); +} + +static int kern_unmap(struct mm_id *mm_idp, + unsigned long virt, unsigned long len) +{ + return os_unmap_memory((void *)virt, len); +} + +static int kern_mprotect(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, + unsigned int prot) +{ + return os_protect_memory((void *)virt, len, + prot & UM_PROT_READ, prot & UM_PROT_WRITE, + 1); +} void report_enomem(void) { @@ -58,164 +60,9 @@ void report_enomem(void) "vm.max_map_count has been reached.\n"); } -static int do_ops(struct host_vm_change *hvc, int end, - int finished) -{ - struct host_vm_op *op; - int i, ret = 0; - - for (i = 0; i < end && !ret; i++) { - op = &hvc->ops[i]; - switch (op->type) { - case MMAP: - if (hvc->userspace) - map(&hvc->mm->context.id, op->u.mmap.addr, - op->u.mmap.len, op->u.mmap.prot, - op->u.mmap.fd, - op->u.mmap.offset); - else - map_memory(op->u.mmap.addr, op->u.mmap.offset, - op->u.mmap.len, 1, 1, 1); - break; - case MUNMAP: - if (hvc->userspace) - unmap(&hvc->mm->context.id, - op->u.munmap.addr, - op->u.munmap.len); - else - ret = os_unmap_memory( - (void *) op->u.munmap.addr, - op->u.munmap.len); - - break; - case MPROTECT: - if (hvc->userspace) - protect(&hvc->mm->context.id, - op->u.mprotect.addr, - op->u.mprotect.len, - op->u.mprotect.prot); - else - ret = os_protect_memory( - (void *) op->u.mprotect.addr, - op->u.mprotect.len, - 1, 1, 1); - break; - default: - printk(KERN_ERR "Unknown op type %d in do_ops\n", - op->type); - BUG(); - break; - } - } - - if (hvc->userspace && finished) - ret = syscall_stub_flush(&hvc->mm->context.id); - - if (ret == -ENOMEM) - report_enomem(); - - return ret; -} - -static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) -{ - __u64 offset; - struct host_vm_op *last; - int fd = -1, ret = 0; - - if (hvc->userspace) - fd = phys_mapping(phys, &offset); - else - offset = phys; - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)) { - last->u.mmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MMAP, - .u = { .mmap = { .addr = virt, - .len = len, - .prot = prot, - .fd = fd, - .offset = offset } - } }); - return ret; -} - -static int add_munmap(unsigned long addr, unsigned long len, - struct host_vm_change *hvc) -{ - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MUNMAP) && - (last->u.munmap.addr + last->u.mmap.len == addr)) { - last->u.munmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MUNMAP, - .u = { .munmap = { .addr = addr, - .len = len } } }); - return ret; -} - -static int add_mprotect(unsigned long addr, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) -{ - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MPROTECT) && - (last->u.mprotect.addr + last->u.mprotect.len == addr) && - (last->u.mprotect.prot == prot)) { - last->u.mprotect.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MPROTECT, - .u = { .mprotect = { .addr = addr, - .len = len, - .prot = prot } } }); - return ret; -} - -#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) - static inline int update_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pte_t *pte; int r, w, x, prot, ret = 0; @@ -235,13 +82,20 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, (x ? UM_PROT_EXEC : 0)); if (pte_newpage(*pte)) { if (pte_present(*pte)) { - if (pte_newpage(*pte)) - ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, prot, hvc); + if (pte_newpage(*pte)) { + __u64 offset; + unsigned long phys = + pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + + ret = ops->mmap(ops->mm_idp, addr, + PAGE_SIZE, prot, fd, + offset); + } } else - ret = add_munmap(addr, PAGE_SIZE, hvc); + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); } else if (pte_newprot(*pte)) - ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); + ret = ops->mprotect(ops->mm_idp, addr, PAGE_SIZE, prot); *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -249,7 +103,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, static inline int update_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pmd_t *pmd; unsigned long next; @@ -260,18 +114,19 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { if (pmd_newpage(*pmd)) { - ret = add_munmap(addr, next - addr, hvc); + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pmd_mkuptodate(*pmd); } } - else ret = update_pte_range(pmd, addr, next, hvc); + else ret = update_pte_range(pmd, addr, next, ops); } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pud_t *pud; unsigned long next; @@ -282,18 +137,19 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, next = pud_addr_end(addr, end); if (!pud_present(*pud)) { if (pud_newpage(*pud)) { - ret = add_munmap(addr, next - addr, hvc); + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pud_mkuptodate(*pud); } } - else ret = update_pmd_range(pud, addr, next, hvc); + else ret = update_pmd_range(pud, addr, next, ops); } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { p4d_t *p4d; unsigned long next; @@ -304,142 +160,62 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { if (p4d_newpage(*p4d)) { - ret = add_munmap(addr, next - addr, hvc); + ret = ops->unmap(ops->mm_idp, addr, + next - addr); p4d_mkuptodate(*p4d); } } else - ret = update_pud_range(p4d, addr, next, hvc); + ret = update_pud_range(p4d, addr, next, ops); } while (p4d++, addr = next, ((addr < end) && !ret)); return ret; } -static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, +static int fix_range_common(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { pgd_t *pgd; - struct host_vm_change hvc; + struct vm_ops ops; unsigned long addr = start_addr, next; - int ret = 0, userspace = 1; + int ret = 0; + + ops.mm_idp = &mm->context.id; + if (mm == &init_mm) { + ops.mmap = kern_map; + ops.unmap = kern_unmap; + ops.mprotect = kern_mprotect; + } else { + ops.mmap = map; + ops.unmap = unmap; + ops.mprotect = protect; + } - hvc = INIT_HVC(mm, userspace); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end_addr); if (!pgd_present(*pgd)) { if (pgd_newpage(*pgd)) { - ret = add_munmap(addr, next - addr, &hvc); + ret = ops.unmap(ops.mm_idp, addr, + next - addr); pgd_mkuptodate(*pgd); } } else - ret = update_p4d_range(pgd, addr, next, &hvc); + ret = update_p4d_range(pgd, addr, next, &ops); } while (pgd++, addr = next, ((addr < end_addr) && !ret)); - if (!ret) - ret = do_ops(&hvc, hvc.index, 1); + if (ret == -ENOMEM) + report_enomem(); + + return ret; } -static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) +static void flush_tlb_kernel_range_common(unsigned long start, unsigned long end) { - struct mm_struct *mm; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long addr, last; - int updated = 0, err = 0, userspace = 0; - struct host_vm_change hvc; + int err; - mm = &init_mm; - hvc = INIT_HVC(mm, userspace); - for (addr = start; addr < end;) { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) { - last = ADD_ROUND(addr, PGDIR_SIZE); - if (last > end) - last = end; - if (pgd_newpage(*pgd)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } + err = fix_range_common(&init_mm, start, end); - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) { - last = ADD_ROUND(addr, P4D_SIZE); - if (last > end) - last = end; - if (p4d_newpage(*p4d)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) { - last = ADD_ROUND(addr, PUD_SIZE); - if (last > end) - last = end; - if (pud_newpage(*pud)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - last = ADD_ROUND(addr, PMD_SIZE); - if (last > end) - last = end; - if (pmd_newpage(*pmd)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pte = pte_offset_kernel(pmd, addr); - if (!pte_present(*pte) || pte_newpage(*pte)) { - updated = 1; - err = add_munmap(addr, PAGE_SIZE, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - if (pte_present(*pte)) - err = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, 0, &hvc); - } - else if (pte_newprot(*pte)) { - updated = 1; - err = add_mprotect(addr, PAGE_SIZE, 0, &hvc); - } - addr += PAGE_SIZE; - } - if (!err) - err = do_ops(&hvc, hvc.index, 1); - - if (err < 0) + if (err) panic("flush_tlb_kernel failed, errno = %d\n", err); - return updated; } void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 46fa10ab9892..c55430775efd 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -175,7 +175,7 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, return NULL; } -void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, +int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, unsigned long long offset) { struct stub_syscall *sc; @@ -185,7 +185,7 @@ void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd && sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { sc->mem.length += len; - return; + return 0; } sc = syscall_stub_alloc(mm_idp); @@ -195,9 +195,11 @@ void map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, sc->mem.prot = prot; sc->mem.fd = phys_fd; sc->mem.offset = MMAP_OFFSET(offset); + + return 0; } -void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { struct stub_syscall *sc; @@ -205,16 +207,18 @@ void unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr); if (sc) { sc->mem.length += len; - return; + return 0; } sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MUNMAP; sc->mem.addr = addr; sc->mem.length = len; + + return 0; } -void protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, +int protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, unsigned int prot) { struct stub_syscall *sc; @@ -223,7 +227,7 @@ void protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MPROTECT, addr); if (sc && sc->mem.prot == prot) { sc->mem.length += len; - return; + return 0; } sc = syscall_stub_alloc(mm_idp); @@ -231,4 +235,6 @@ void protect(struct mm_id *mm_idp, unsigned long addr, unsigned long len, sc->mem.addr = addr; sc->mem.length = len; sc->mem.prot = prot; + + return 0; } From bcf3d957c63d8b6d718b862fea18c5f14ce803e2 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 3 Jul 2024 15:45:36 +0200 Subject: [PATCH 35/41] um: refactor TLB update handling Conceptually, we want the memory mappings to always be up to date and represent whatever is in the TLB. To ensure that, we need to sync them over in the userspace case and for the kernel we need to process the mappings. The kernel will call flush_tlb_* if page table entries that were valid before become invalid. Unfortunately, this is not the case if entries are added. As such, change both flush_tlb_* and set_ptes to track the memory range that has to be synchronized. For the kernel, we need to execute a flush_tlb_kern_* immediately but we can wait for the first page fault in case of set_ptes. For userspace in contrast we only store that a range of memory needs to be synced and do so whenever we switch to that process. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20240703134536.1161108-13-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_kern.c | 2 - arch/um/include/asm/mmu.h | 4 + arch/um/include/asm/pgtable.h | 32 +++++++ arch/um/include/asm/tlbflush.h | 46 ++++++++-- arch/um/include/shared/skas/skas.h | 1 + arch/um/kernel/skas/process.c | 10 +++ arch/um/kernel/tlb.c | 130 +++-------------------------- arch/um/kernel/trap.c | 15 +++- arch/um/os-Linux/skas/process.c | 2 + 9 files changed, 110 insertions(+), 132 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 1542a6d6a7a1..c186aa77cda4 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include "mconsole_kern.h" #include @@ -770,7 +769,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); goto error; } - flush_tlb_kernel_vm(); err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap, ubd_dev->cow.bitmap_offset, diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index 6e433098d8c3..a3eaca41ff61 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -10,6 +10,10 @@ typedef struct mm_context { struct mm_id id; + + /* Address range in need of a TLB sync */ + unsigned long sync_tlb_range_from; + unsigned long sync_tlb_range_to; } mm_context_t; #endif diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e1ece21dbe3f..5bb397b65efb 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -244,6 +244,38 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) #define PFN_PTE_SHIFT PAGE_SHIFT +static inline void um_tlb_mark_sync(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + if (!mm->context.sync_tlb_range_to) { + mm->context.sync_tlb_range_from = start; + mm->context.sync_tlb_range_to = end; + } else { + if (start < mm->context.sync_tlb_range_from) + mm->context.sync_tlb_range_from = start; + if (end > mm->context.sync_tlb_range_to) + mm->context.sync_tlb_range_to = end; + } +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int nr) +{ + /* Basically the default implementation */ + size_t length = nr * PAGE_SIZE; + + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); + } + + um_tlb_mark_sync(mm, addr, addr + length); +} + #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/arch/um/include/asm/tlbflush.h b/arch/um/include/asm/tlbflush.h index a5bda890390d..db997976b6ea 100644 --- a/arch/um/include/asm/tlbflush.h +++ b/arch/um/include/asm/tlbflush.h @@ -9,23 +9,51 @@ #include /* - * TLB flushing: + * In UML, we need to sync the TLB over by using mmap/munmap/mprotect syscalls + * from the process handling the MM (which can be the kernel itself). + * + * To track updates, we can hook into set_ptes and flush_tlb_*. With set_ptes + * we catch all PTE transitions where memory that was unusable becomes usable. + * While with flush_tlb_* we can track any memory that becomes unusable and + * even if a higher layer of the page table was modified. + * + * So, we simply track updates using both methods and mark the memory area to + * be synced later on. The only special case is that flush_tlb_kern_* needs to + * be executed immediately as there is no good synchronization point in that + * case. In contrast, in the set_ptes case we can wait for the next kernel + * segfault before we do the synchornization. * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_kernel_vm() flushes the kernel vm area * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages */ +extern int um_tlb_sync(struct mm_struct *mm); + extern void flush_tlb_all(void); extern void flush_tlb_mm(struct mm_struct *mm); -extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end); -extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long address); -extern void flush_tlb_kernel_vm(void); -extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -extern void __flush_tlb_one(unsigned long addr); + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long address) +{ + um_tlb_mark_sync(vma->vm_mm, address, address + PAGE_SIZE); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + um_tlb_mark_sync(vma->vm_mm, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + um_tlb_mark_sync(&init_mm, start, end); + + /* Kernel needs to be synced immediately */ + um_tlb_sync(&init_mm); +} #endif diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index 5c78b0cc3dd4..ebaa116de30b 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -16,5 +16,6 @@ extern void handle_syscall(struct uml_pt_regs *regs); extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); extern struct mm_id *current_mm_id(void); +extern void current_mm_sync(void); #endif diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 41901a74fccc..5f9c1c5f36e2 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -8,6 +8,8 @@ #include #include +#include + #include #include #include @@ -58,3 +60,11 @@ struct mm_id *current_mm_id(void) return ¤t->mm->context.id; } + +void current_mm_sync(void) +{ + if (current->mm == NULL) + return; + + um_tlb_sync(current->mm); +} diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 2ba1865679ae..44c6fc697f3a 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -170,14 +170,16 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, return ret; } -static int fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr) +int um_tlb_sync(struct mm_struct *mm) { pgd_t *pgd; struct vm_ops ops; - unsigned long addr = start_addr, next; + unsigned long addr = mm->context.sync_tlb_range_from, next; int ret = 0; + if (mm->context.sync_tlb_range_to == 0) + return 0; + ops.mm_idp = &mm->context.id; if (mm == &init_mm) { ops.mmap = kern_map; @@ -191,7 +193,7 @@ static int fix_range_common(struct mm_struct *mm, unsigned long start_addr, pgd = pgd_offset(mm, addr); do { - next = pgd_addr_end(addr, end_addr); + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); if (!pgd_present(*pgd)) { if (pgd_newpage(*pgd)) { ret = ops.unmap(ops.mm_idp, addr, @@ -200,89 +202,18 @@ static int fix_range_common(struct mm_struct *mm, unsigned long start_addr, } } else ret = update_p4d_range(pgd, addr, next, &ops); - } while (pgd++, addr = next, ((addr < end_addr) && !ret)); + } while (pgd++, addr = next, + ((addr < mm->context.sync_tlb_range_to) && !ret)); if (ret == -ENOMEM) report_enomem(); + mm->context.sync_tlb_range_from = 0; + mm->context.sync_tlb_range_to = 0; + return ret; } -static void flush_tlb_kernel_range_common(unsigned long start, unsigned long end) -{ - int err; - - err = fix_range_common(&init_mm, start, end); - - if (err) - panic("flush_tlb_kernel failed, errno = %d\n", err); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - struct mm_struct *mm = vma->vm_mm; - int r, w, x, prot; - struct mm_id *mm_id; - - address &= PAGE_MASK; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto kill; - - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - goto kill; - - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - goto kill; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - goto kill; - - pte = pte_offset_kernel(pmd, address); - - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) { - w = 0; - } - - mm_id = &mm->context.id; - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - unsigned long long offset; - int fd; - - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - map(mm_id, address, PAGE_SIZE, prot, fd, offset); - } else - unmap(mm_id, address, PAGE_SIZE); - } else if (pte_newprot(*pte)) - protect(mm_id, address, PAGE_SIZE, prot); - - *pte = pte_mkuptodate(*pte); - - return; - -kill: - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL); -} - void flush_tlb_all(void) { /* @@ -295,48 +226,11 @@ void flush_tlb_all(void) flush_tlb_mm(current->mm); } -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_kernel_range_common(start, end); -} - -void flush_tlb_kernel_vm(void) -{ - flush_tlb_kernel_range_common(start_vm, end_vm); -} - -void __flush_tlb_one(unsigned long addr) -{ - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); -} - -static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr) -{ - /* - * Don't bother flushing if this address space is about to be - * destroyed. - */ - if (atomic_read(&mm->mm_users) == 0) - return; - - fix_range_common(mm, start_addr, end_addr); -} - -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_mm == NULL) - flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end); -} -EXPORT_SYMBOL(flush_tlb_range); - void flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end); + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 6d8ae86ae978..97c8df9c4401 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -113,7 +113,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif - flush_tlb_page(vma, address); + out: mmap_read_unlock(mm); out_nosemaphore: @@ -210,8 +210,17 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, if (!is_user && regs) current->thread.segv_regs = container_of(regs, struct pt_regs, regs); - if (!is_user && (address >= start_vm) && (address < end_vm)) { - flush_tlb_kernel_vm(); + if (!is_user && init_mm.context.sync_tlb_range_to) { + /* + * Kernel has pending updates from set_ptes that were not + * flushed yet. Syncing them should fix the pagefault (if not + * we'll get here again and panic). + */ + err = um_tlb_sync(&init_mm); + if (err == -ENOMEM) + report_enomem(); + if (err) + panic("Failed to sync kernel TLBs: %d", err); goto out; } else if (current->mm == NULL) { diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 9cb61610147a..f7088345b3fc 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -347,6 +347,8 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) while (1) { time_travel_print_bc_msg(); + current_mm_sync(); + /* Flush out any pending syscalls */ err = syscall_stub_flush(current_mm_id()); if (err) { From cd01672d64a358cccc087c8b845ebe6b7e9e2ca4 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Tue, 2 Jul 2024 10:25:50 +0200 Subject: [PATCH 36/41] um: Enable preemption in UML Since userspace state is saved in the MM process, kernel using FPU still doesn't really need to do anything, so this really is as simple as enabling preemption. The irq critical section in sigio_handler() needs preempt_disable()/preempt_enable(). Signed-off-by: Anton Ivanov Link: https://patch.msgid.link/20240702102549.d2fcea450854.I12f5a53d80ec1e425e66ef272b1e95cb523b608e@changeid [rebase, remove FPU save/restore, fix x86/um Makefile, rewrite commit message] Signed-off-by: Johannes Berg --- arch/um/Kconfig | 2 +- arch/um/kernel/irq.c | 2 ++ arch/x86/um/Makefile | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 1faefc0a18ea..dca84fd6d00a 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -11,7 +11,7 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_NO_PREEMPT + select ARCH_NO_PREEMPT_DYNAMIC select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index ceda4bd2e5ed..534e91797f89 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -238,7 +238,9 @@ static void _sigio_handler(struct uml_pt_regs *regs, void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) { + preempt_disable(); _sigio_handler(regs, irqs_suspended); + preempt_enable(); } static struct irq_entry *get_irq_entry_by_fd(int fd) diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 17b85209c43d..36e67fc97c22 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -31,7 +31,6 @@ obj-y += syscalls_64.o vdso/ subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \ ../lib/memmove_64.o ../lib/memset_64.o -subarch-$(CONFIG_PREEMPTION) += ../entry/thunk_64.o endif From bc94745d042bff9c5888451816d1a1efc7566fc3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 17:20:14 +0200 Subject: [PATCH 37/41] um: remove pcap driver from documentation We just removed the driver, but forgot the row in the documentation table mentioning it. Remove that. Link: https://patch.msgid.link/20240703172013.c3232dc5cbe9.I7f9941095d45a39d0f1e83680a408b3c6d445242@changeid Acked-By: Anton Ivanov Signed-off-by: Johannes Berg --- Documentation/virt/uml/user_mode_linux_howto_v2.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/virt/uml/user_mode_linux_howto_v2.rst b/Documentation/virt/uml/user_mode_linux_howto_v2.rst index d1cfe415e4c4..27942446f406 100644 --- a/Documentation/virt/uml/user_mode_linux_howto_v2.rst +++ b/Documentation/virt/uml/user_mode_linux_howto_v2.rst @@ -223,8 +223,6 @@ remote UML and other VM instances. +-----------+--------+------------------------------------+------------+ | socket | legacy | none | ~ 450Mbit | +-----------+--------+------------------------------------+------------+ -| pcap | legacy | rx only | ~ 450Mbit | -+-----------+--------+------------------------------------+------------+ | ethertap | legacy | obsolete | ~ 500Mbit | +-----------+--------+------------------------------------+------------+ | vde | legacy | obsolete | ~ 500Mbit | From 824ac4a5edd3f7494ab1996826c4f47f8ef0f63d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 17:22:36 +0200 Subject: [PATCH 38/41] um: line: always fill *error_out in setup_one_line() The pointer isn't initialized by callers, but I have encountered cases where it's still printed; initialize it in all possible cases in setup_one_line(). Link: https://patch.msgid.link/20240703172235.ad863568b55f.Iaa1eba4db8265d7715ba71d5f6bb8c7ff63d27e9@changeid Acked-By: Anton Ivanov Signed-off-by: Johannes Berg --- arch/um/drivers/line.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index d82bc3fdb86e..43d8959cc746 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -383,6 +383,7 @@ int setup_one_line(struct line *lines, int n, char *init, parse_chan_pair(NULL, line, n, opts, error_out); err = 0; } + *error_out = "configured as 'none'"; } else { char *new = kstrdup(init, GFP_KERNEL); if (!new) { @@ -406,6 +407,7 @@ int setup_one_line(struct line *lines, int n, char *init, } } if (err) { + *error_out = "failed to parse channel pair"; line->init_str = NULL; line->valid = 0; kfree(new); From 86abcd6eeb56e49311aea2f4695af0b3b5eaeb26 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 17:38:40 +0200 Subject: [PATCH 39/41] um: register power-off handler Otherwise we always get reboot: Power off not available: System halted instead which is really quite pointless. Link: https://patch.msgid.link/20240703173839.fcbb538c6686.I3d333f4773cff93c4337c4d128ee0b1b501b3dfa@changeid Acked-By: Anton Ivanov Signed-off-by: Johannes Berg --- arch/um/kernel/reboot.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 25840eee1068..3736bca626ba 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -59,3 +59,18 @@ void machine_halt(void) { machine_power_off(); } + +static int sys_power_off_handler(struct sys_off_data *data) +{ + machine_power_off(); + return 0; +} + +static int register_power_off(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + sys_power_off_handler, NULL); + return 0; +} +__initcall(register_power_off); From a0470a9f699a847e519293db9e33a4dae169467d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 18:46:07 +0200 Subject: [PATCH 40/41] um: vector: remove vp->lock This lock is useless, all the places that are using it for some locking will already hold the RTNL. Just remove it. Link: https://patch.msgid.link/20240703184606.19aa35b14959.I9cf5f2c4e35abd06cc89bf2e990fa755eb8e5f0f@changeid Acked-By: Anton Ivanov Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 16 +--------------- arch/um/drivers/vector_kern.h | 1 - 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 4279793b11b7..0861e32d2fbd 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1115,7 +1115,6 @@ static int irq_rr; static int vector_net_close(struct net_device *dev) { struct vector_private *vp = netdev_priv(dev); - unsigned long flags; netif_stop_queue(dev); del_timer(&vp->tl); @@ -1158,10 +1157,8 @@ static int vector_net_close(struct net_device *dev) destroy_queue(vp->tx_queue); kfree(vp->fds); vp->fds = NULL; - spin_lock_irqsave(&vp->lock, flags); vp->opened = false; vp->in_error = false; - spin_unlock_irqrestore(&vp->lock, flags); return 0; } @@ -1203,17 +1200,12 @@ static void vector_reset_tx(struct work_struct *work) static int vector_net_open(struct net_device *dev) { struct vector_private *vp = netdev_priv(dev); - unsigned long flags; int err = -EINVAL; struct vector_device *vdevice; - spin_lock_irqsave(&vp->lock, flags); - if (vp->opened) { - spin_unlock_irqrestore(&vp->lock, flags); + if (vp->opened) return -ENXIO; - } vp->opened = true; - spin_unlock_irqrestore(&vp->lock, flags); vp->bpf = uml_vector_user_bpf(get_bpf_file(vp->parsed)); @@ -1387,8 +1379,6 @@ static int vector_net_load_bpf_flash(struct net_device *dev, return -1; } - spin_lock(&vp->lock); - if (vp->bpf != NULL) { if (vp->opened) uml_vector_detach_bpf(vp->fds->rx_fd, vp->bpf); @@ -1417,15 +1407,12 @@ static int vector_net_load_bpf_flash(struct net_device *dev, if (vp->opened) result = uml_vector_attach_bpf(vp->fds->rx_fd, vp->bpf); - spin_unlock(&vp->lock); - return result; free_buffer: release_firmware(fw); flash_fail: - spin_unlock(&vp->lock); if (vp->bpf != NULL) kfree(vp->bpf->filter); kfree(vp->bpf); @@ -1631,7 +1618,6 @@ static void vector_eth_configure( INIT_WORK(&vp->reset_tx, vector_reset_tx); timer_setup(&vp->tl, vector_timer_expire, 0); - spin_lock_init(&vp->lock); /* FIXME */ dev->netdev_ops = &vector_netdev_ops; diff --git a/arch/um/drivers/vector_kern.h b/arch/um/drivers/vector_kern.h index 2a1fa8e0f3e1..806df551be0b 100644 --- a/arch/um/drivers/vector_kern.h +++ b/arch/um/drivers/vector_kern.h @@ -71,7 +71,6 @@ struct vector_estats { struct vector_private { struct list_head list; - spinlock_t lock; struct net_device *dev; struct napi_struct napi ____cacheline_aligned; From 98ff534ec2cd02496c166614e6c1391d8e092e51 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jul 2024 18:46:23 +0200 Subject: [PATCH 41/41] um: vector: always reset vp->opened If open fails, we have already set vp->opened, but it's not reset so that any further attempts will just return -ENXIO. Reset vp->opened even if close has nothing to do. This helps e.g. with slirp4netns handling only a single connection, you can then restart it and open the device again. Link: https://patch.msgid.link/20240703184622.df40c5c38461.Id4e20b48938c6019d99e6133227a34ac059db466@changeid Acked-By: Anton Ivanov Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 0861e32d2fbd..2d473282ab51 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1119,6 +1119,8 @@ static int vector_net_close(struct net_device *dev) netif_stop_queue(dev); del_timer(&vp->tl); + vp->opened = false; + if (vp->fds == NULL) return 0; @@ -1157,7 +1159,6 @@ static int vector_net_close(struct net_device *dev) destroy_queue(vp->tx_queue); kfree(vp->fds); vp->fds = NULL; - vp->opened = false; vp->in_error = false; return 0; }