linux-stable/arch/x86/xen/enlighten_pv.c
Linus Torvalds b18d62891a Merge branch 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 APIC updates from Thomas Gleixner:
 "This update provides a major overhaul of the APIC initialization and
  vector allocation code:

   - Unification of the APIC and interrupt mode setup which was
     scattered all over the place and was hard to follow. This also
     distangles the timer setup from the APIC initialization which
     brings a clear separation of functionality.

     Great detective work from Dou Lyiang!

   - Refactoring of the x86 vector allocation mechanism. The existing
     code was based on nested loops and rather convoluted APIC callbacks
     which had a horrible worst case behaviour and tried to serve all
     different use cases in one go. This led to quite odd hacks when
     supporting the new managed interupt facility for multiqueue devices
     and made it more or less impossible to deal with the vector space
     exhaustion which was a major roadblock for server hibernation.

     Aside of that the code dealing with cpu hotplug and the system
     vectors was disconnected from the actual vector management and
     allocation code, which made it hard to follow and maintain.

     Utilizing the new bitmap matrix allocator core mechanism, the new
     allocator and management code consolidates the handling of system
     vectors, legacy vectors, cpu hotplug mechanisms and the actual
     allocation which needs to be aware of system and legacy vectors and
     hotplug constraints into a single consistent entity.

     This has one visible change: The support for multi CPU targets of
     interrupts, which is only available on a certain subset of
     CPUs/APIC variants has been removed in favour of single interrupt
     targets. A proper analysis of the multi CPU target feature revealed
     that there is no real advantage as the vast majority of interrupts
     end up on the CPU with the lowest APIC id in the set of target CPUs
     anyway. That change was agreed on by the relevant folks and allowed
     to simplify the implementation significantly and to replace rather
     fragile constructs like the vector cleanup IPI with straight
     forward and solid code.

     Furthermore this allowed to cleanly separate the allocation details
     for legacy, normal and managed interrupts:

      * Legacy interrupts are not longer wasting 16 vectors
        unconditionally

      * Managed interrupts have now a guaranteed vector reservation, but
        the actual vector assignment happens when the interrupt is
        requested. It's guaranteed not to fail.

      * Normal interrupts no longer allocate vectors unconditionally
        when the interrupt is set up (IO/APIC init or MSI(X) enable).
        The mechanism has been switched to a best effort reservation
        mode. The actual allocation happens when the interrupt is
        requested. Contrary to managed interrupts the request can fail
        due to vector space exhaustion, but drivers must handle a fail
        of request_irq() anyway. When the interrupt is freed, the vector
        is handed back as well.

        This solves a long standing problem with large unconditional
        vector allocations for a certain class of enterprise devices
        which prevented server hibernation due to vector space
        exhaustion when the unused allocated vectors had to be migrated
        to CPU0 while unplugging all non boot CPUs.

     The code has been equipped with trace points and detailed debugfs
     information to aid analysis of the vector space"

* 'x86-apic-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  x86/vector/msi: Select CONFIG_GENERIC_IRQ_RESERVATION_MODE
  PCI/MSI: Set MSI_FLAG_MUST_REACTIVATE in core code
  genirq: Add config option for reservation mode
  x86/vector: Use correct per cpu variable in free_moved_vector()
  x86/apic/vector: Ignore set_affinity call for inactive interrupts
  x86/apic: Fix spelling mistake: "symmectic" -> "symmetric"
  x86/apic: Use dead_cpu instead of current CPU when cleaning up
  ACPI/init: Invoke early ACPI initialization earlier
  x86/vector: Respect affinity mask in irq descriptor
  x86/irq: Simplify hotplug vector accounting
  x86/vector: Switch IOAPIC to global reservation mode
  x86/vector/msi: Switch to global reservation mode
  x86/vector: Handle managed interrupts proper
  x86/io_apic: Reevaluate vector configuration on activate()
  iommu/amd: Reevaluate vector configuration on activate()
  iommu/vt-d: Reevaluate vector configuration on activate()
  x86/apic/msi: Force reactivation of interrupts at startup time
  x86/vector: Untangle internal state from irq_cfg
  x86/vector: Compile SMP only code conditionally
  x86/apic: Remove unused callbacks
  ...
2017-11-13 18:29:23 -08:00

1468 lines
36 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Core of Xen paravirt_ops implementation.
*
* This file contains the xen_paravirt_ops structure itself, and the
* implementations for:
* - privileged instructions
* - interrupt flags
* - segment operations
* - booting and setup
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/edd.h>
#include <linux/frame.h>
#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvc-console.h>
#include <xen/acpi.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/cpuid.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
#include <acpi/pdc_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
#include "pmu.h"
void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
struct tls_descs {
struct desc_struct desc[3];
};
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
* changed. Since Xen writes different descriptors than the one
* passed in the update_descriptor hypercall we keep shadow copies to
* compare against.
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void __init xen_banner(void)
{
unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
struct xen_extraversion extra;
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
{
unsigned int version;
if (!xen_domain())
return false;
version = HYPERVISOR_xen_version(XENVER_version, NULL);
if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
((version >> 16) > major))
return true;
return false;
}
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
unsigned maskebx = ~0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
case CPUID_MWAIT_LEAF:
/* Synthesize the values.. */
*ax = 0;
*bx = 0;
*cx = cpuid_leaf5_ecx_val;
*dx = cpuid_leaf5_edx_val;
return;
case 0xb:
/* Suppress extended topology stuff */
maskebx = 0;
break;
}
asm(XEN_EMULATE_PREFIX "cpuid"
: "=a" (*ax),
"=b" (*bx),
"=c" (*cx),
"=d" (*dx)
: "0" (*ax), "2" (*cx));
*bx &= maskebx;
}
STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
static bool __init xen_check_mwait(void)
{
#ifdef CONFIG_ACPI
struct xen_platform_op op = {
.cmd = XENPF_set_processor_pminfo,
.u.set_pminfo.id = -1,
.u.set_pminfo.type = XEN_PM_PDC,
};
uint32_t buf[3];
unsigned int ax, bx, cx, dx;
unsigned int mwait_mask;
/* We need to determine whether it is OK to expose the MWAIT
* capability to the kernel to harvest deeper than C3 states from ACPI
* _CST using the processor_harvest_xen.c module. For this to work, we
* need to gather the MWAIT_LEAF values (which the cstate.c code
* checks against). The hypervisor won't expose the MWAIT flag because
* it would break backwards compatibility; so we will find out directly
* from the hardware and hypercall.
*/
if (!xen_initial_domain())
return false;
/*
* When running under platform earlier than Xen4.2, do not expose
* mwait, to avoid the risk of loading native acpi pad driver
*/
if (!xen_running_on_version_or_later(4, 2))
return false;
ax = 1;
cx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
(1 << (X86_FEATURE_MWAIT % 32));
if ((cx & mwait_mask) != mwait_mask)
return false;
/* We need to emulate the MWAIT_LEAF and for that we need both
* ecx and edx. The hypercall provides only partial information.
*/
ax = CPUID_MWAIT_LEAF;
bx = 0;
cx = 0;
dx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
(buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
return true;
#else
return false;
#endif
}
static bool __init xen_check_xsave(void)
{
unsigned int cx, xsave_mask;
cx = cpuid_ecx(1);
xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
(1 << (X86_FEATURE_OSXSAVE % 32));
/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
return (cx & xsave_mask) == xsave_mask;
}
static void __init xen_init_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_XENPV);
setup_clear_cpu_cap(X86_FEATURE_DCA);
setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
setup_clear_cpu_cap(X86_FEATURE_MTRR);
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
* as xen_flush_tlb_others() would need updating.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
if (!xen_initial_domain())
setup_clear_cpu_cap(X86_FEATURE_ACPI);
if (xen_check_mwait())
setup_force_cpu_cap(X86_FEATURE_MWAIT);
else
setup_clear_cpu_cap(X86_FEATURE_MWAIT);
if (!xen_check_xsave()) {
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
}
}
static void xen_set_debugreg(int reg, unsigned long val)
{
HYPERVISOR_set_debugreg(reg, val);
}
static unsigned long xen_get_debugreg(int reg)
{
return HYPERVISOR_get_debugreg(reg);
}
static void xen_end_context_switch(struct task_struct *next)
{
xen_mc_flush();
paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
{
return 0;
}
/*
* Set the page permissions for a particular virtual address. If the
* address is a vmalloc mapping (or other non-linear mapping), then
* find the linear mapping of the page and also set its protections to
* match.
*/
static void set_aliased_prot(void *v, pgprot_t prot)
{
int level;
pte_t *ptep;
pte_t pte;
unsigned long pfn;
struct page *page;
unsigned char dummy;
ptep = lookup_address((unsigned long)v, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
page = pfn_to_page(pfn);
pte = pfn_pte(pfn, prot);
/*
* Careful: update_va_mapping() will fail if the virtual address
* we're poking isn't populated in the page tables. We don't
* need to worry about the direct map (that's always in the page
* tables), but we need to be careful about vmap space. In
* particular, the top level page table can lazily propagate
* entries between processes, so if we've switched mms since we
* vmapped the target in the first place, we might not have the
* top-level page table entry populated.
*
* We disable preemption because we want the same mm active when
* we probe the target and when we issue the hypercall. We'll
* have the same nominal mm, but if we're a kernel thread, lazy
* mm dropping could change our pgd.
*
* Out of an abundance of caution, this uses __get_user() to fault
* in the target address just in case there's some obscure case
* in which the target address isn't readable.
*/
preempt_disable();
probe_kernel_read(&dummy, v, 1);
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();
if (!PageHighMem(page)) {
void *av = __va(PFN_PHYS(pfn));
if (av != v)
if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
BUG();
} else
kmap_flush_unused();
preempt_enable();
}
static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
/*
* We need to mark the all aliases of the LDT pages RO. We
* don't need to call vm_flush_aliases(), though, since that's
* only responsible for flushing aliases out the TLBs, not the
* page tables, and Xen will flush the TLB for us if needed.
*
* To avoid confusing future readers: none of this is necessary
* to load the LDT. The hypervisor only checks this when the
* LDT is faulted in due to subsequent descriptor access.
*/
for (i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
for (i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL);
}
static void xen_set_ldt(const void *addr, unsigned entries)
{
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
trace_xen_cpu_set_ldt(addr, entries);
op = mcs.args;
op->cmd = MMUEXT_SET_LDT;
op->arg1.linear_addr = (unsigned long)addr;
op->arg2.nr_ents = entries;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
static void xen_load_gdt(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
unsigned long frames[pages];
int f;
/*
* A GDT can be up to 64k in size, which corresponds to 8192
* 8-byte entries, or 16 4k pages..
*/
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
int level;
pte_t *ptep;
unsigned long pfn, mfn;
void *virt;
/*
* The GDT is per-cpu and is in the percpu data area.
* That can be virtually mapped, so we need to do a
* page-walk to get the underlying MFN for the
* hypercall. The page can also be in the kernel's
* linear range, so we need to RO that mapping too.
*/
ptep = lookup_address(va, &level);
BUG_ON(ptep == NULL);
pfn = pte_pfn(*ptep);
mfn = pfn_to_mfn(pfn);
virt = __va(PFN_PHYS(pfn));
frames[f] = mfn;
make_lowmem_page_readonly((void *)va);
make_lowmem_page_readonly(virt);
}
if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
BUG();
}
/*
* load_gdt for early boot, when the gdt is only mapped once
*/
static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
unsigned long va = dtr->address;
unsigned int size = dtr->size + 1;
unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE);
unsigned long frames[pages];
int f;
/*
* A GDT can be up to 64k in size, which corresponds to 8192
* 8-byte entries, or 16 4k pages..
*/
BUG_ON(size > 65536);
BUG_ON(va & ~PAGE_MASK);
for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
pte_t pte;
unsigned long pfn, mfn;
pfn = virt_to_pfn(va);
mfn = pfn_to_mfn(pfn);
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
BUG();
frames[f] = mfn;
}
if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
BUG();
}
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
return !memcmp(d1, d2, sizeof(*d1));
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
struct desc_struct *gdt;
xmaddr_t maddr;
struct multicall_space mc;
if (desc_equal(shadow, &t->tls_array[i]))
return;
*shadow = t->tls_array[i];
gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone
* and lazy gs handling is enabled, it means we're in a
* context switch, and %gs has just been saved. This means we
* can zero it out to prevent faults on exit from the
* hypervisor if the next process has no %gs. Either way, it
* has been saved, and the new value will get loaded properly.
* This will go away as soon as Xen has been modified to not
* save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
* must not zero %gs on x86_64
*
* For x86_64, we need to zero %fs, otherwise we may get an
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif
}
xen_mc_batch();
load_TLS_descriptor(t, cpu, 0);
load_TLS_descriptor(t, cpu, 1);
load_TLS_descriptor(t, cpu, 2);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
BUG();
}
#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
{
xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
preempt_disable();
xen_mc_flush();
if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
BUG();
preempt_enable();
}
#ifdef CONFIG_X86_64
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
bool ist_okay;
};
static struct trap_array_entry trap_array[] = {
{ debug, xen_xendebug, true },
{ int3, xen_xenint3, true },
{ double_fault, xen_double_fault, true },
#ifdef CONFIG_X86_MCE
{ machine_check, xen_machine_check, true },
#endif
{ nmi, xen_xennmi, true },
{ overflow, xen_overflow, false },
#ifdef CONFIG_IA32_EMULATION
{ entry_INT80_compat, xen_entry_INT80_compat, false },
#endif
{ page_fault, xen_page_fault, false },
{ divide_error, xen_divide_error, false },
{ bounds, xen_bounds, false },
{ invalid_op, xen_invalid_op, false },
{ device_not_available, xen_device_not_available, false },
{ coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
{ invalid_TSS, xen_invalid_TSS, false },
{ segment_not_present, xen_segment_not_present, false },
{ stack_segment, xen_stack_segment, false },
{ general_protection, xen_general_protection, false },
{ spurious_interrupt_bug, xen_spurious_interrupt_bug, false },
{ coprocessor_error, xen_coprocessor_error, false },
{ alignment_check, xen_alignment_check, false },
{ simd_coprocessor_error, xen_simd_coprocessor_error, false },
};
static bool get_trap_addr(void **addr, unsigned int ist)
{
unsigned int nr;
bool ist_okay = false;
/*
* Replace trap handler addresses by Xen specific ones.
* Check for known traps using IST and whitelist them.
* The debugger ones are the only ones we care about.
* Xen will handle faults like double_fault, * so we should never see
* them. Warn if there's an unexpected IST-using fault handler.
*/
for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
struct trap_array_entry *entry = trap_array + nr;
if (*addr == entry->orig) {
*addr = entry->xen;
ist_okay = entry->ist_okay;
break;
}
}
if (WARN_ON(ist != 0 && !ist_okay))
return false;
return true;
}
#endif
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
unsigned long addr;
if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
return 0;
info->vector = vector;
addr = gate_offset(val);
#ifdef CONFIG_X86_64
if (!get_trap_addr((void **)&addr, val->bits.ist))
return 0;
#endif /* CONFIG_X86_64 */
info->address = addr;
info->cs = gate_segment(val);
info->flags = val->bits.dpl;
/* interrupt gates clear IF */
if (val->bits.type == GATE_INTERRUPT)
info->flags |= 1 << 2;
return 1;
}
/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
/* Set an IDT entry. If the entry is part of the current IDT, then
also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
unsigned long p = (unsigned long)&dt[entrynum];
unsigned long start, end;
trace_xen_cpu_write_idt_entry(dt, entrynum, g);
preempt_disable();
start = __this_cpu_read(idt_desc.address);
end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
native_write_idt_entry(dt, entrynum, g);
if (p >= start && (p + 8) <= end) {
struct trap_info info[2];
info[1].address = 0;
if (cvt_gate_to_trap(entrynum, g, &info[0]))
if (HYPERVISOR_set_trap_table(info))
BUG();
}
preempt_enable();
}
static void xen_convert_trap_info(const struct desc_ptr *desc,
struct trap_info *traps)
{
unsigned in, out, count;
count = (desc->size+1) / sizeof(gate_desc);
BUG_ON(count > 256);
for (in = out = 0; in < count; in++) {
gate_desc *entry = (gate_desc *)(desc->address) + in;
if (cvt_gate_to_trap(in, entry, &traps[out]))
out++;
}
traps[out].address = 0;
}
void xen_copy_trap_info(struct trap_info *traps)
{
const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
xen_convert_trap_info(desc, traps);
}
/* Load a new IDT into Xen. In principle this can be per-CPU, so we
hold a spinlock to protect the static traps[] array (static because
it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
trace_xen_cpu_load_idt(desc);
spin_lock(&lock);
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
xen_convert_trap_info(desc, traps);
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
BUG();
spin_unlock(&lock);
}
/* Write a GDT descriptor entry. Ignore LDT descriptors, since
they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
const void *desc, int type)
{
trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
preempt_disable();
switch (type) {
case DESC_LDT:
case DESC_TSS:
/* ignore */
break;
default: {
xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
xen_mc_flush();
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
BUG();
}
}
preempt_enable();
}
/*
* Version of write_gdt_entry for use at early boot-time needed to
* update an entry as simply as possible.
*/
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
const void *desc, int type)
{
trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
switch (type) {
case DESC_LDT:
case DESC_TSS:
/* ignore */
break;
default: {
xmaddr_t maddr = virt_to_machine(&dt[entry]);
if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
dt[entry] = *(struct desc_struct *)desc;
}
}
}
static void xen_load_sp0(unsigned long sp0)
{
struct multicall_space mcs;
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;
/* Force the change at ring 0. */
set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
static void xen_io_delay(void)
{
}
static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
static unsigned long xen_read_cr0(void)
{
unsigned long cr0 = this_cpu_read(xen_cr0_value);
if (unlikely(cr0 == 0)) {
cr0 = native_read_cr0();
this_cpu_write(xen_cr0_value, cr0);
}
return cr0;
}
static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
this_cpu_write(xen_cr0_value, cr0);
/* Only pay attention to cr0.TS; everything else is
ignored. */
mcs = xen_mc_entry(0);
MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
static void xen_write_cr4(unsigned long cr4)
{
cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
#ifdef CONFIG_X86_64
static inline unsigned long xen_read_cr8(void)
{
return 0;
}
static inline void xen_write_cr8(unsigned long val)
{
BUG_ON(val);
}
#endif
static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
if (pmu_msr_read(msr, &val, err))
return val;
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
#ifdef CONFIG_X86_X2APIC
if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
#endif
val &= ~X2APIC_ENABLE;
break;
}
return val;
}
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
ret = 0;
switch (msr) {
#ifdef CONFIG_X86_64
unsigned which;
u64 base;
case MSR_FS_BASE: which = SEGBASE_FS; goto set;
case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
set:
base = ((u64)high << 32) | low;
if (HYPERVISOR_set_segment_base(which, base) != 0)
ret = -EIO;
break;
#endif
case MSR_STAR:
case MSR_CSTAR:
case MSR_LSTAR:
case MSR_SYSCALL_MASK:
case MSR_IA32_SYSENTER_CS:
case MSR_IA32_SYSENTER_ESP:
case MSR_IA32_SYSENTER_EIP:
/* Fast syscall setup is all done in hypercalls, so
these are all ignored. Stub them out here to stop
Xen console noise. */
break;
default:
if (!pmu_msr_write(msr, low, high, &ret))
ret = native_write_msr_safe(msr, low, high);
}
return ret;
}
static u64 xen_read_msr(unsigned int msr)
{
/*
* This will silently swallow a #GP from RDMSR. It may be worth
* changing that.
*/
int err;
return xen_read_msr_safe(msr, &err);
}
static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
{
/*
* This will silently swallow a #GP from WRMSR. It may be worth
* changing that.
*/
xen_write_msr_safe(msr, low, high);
}
void xen_setup_shared_info(void)
{
set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
HYPERVISOR_shared_info =
(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
xen_setup_mfn_list_list();
if (system_state == SYSTEM_BOOTING) {
#ifndef CONFIG_SMP
/*
* In UP this is as good a place as any to set up shared info.
* Limit this to boot only, at restore vcpu setup is done via
* xen_vcpu_restore().
*/
xen_setup_vcpu_info_placement();
#endif
/*
* Now that shared info is set up we can start using routines
* that point to pvclock area.
*/
xen_init_time_ops();
}
}
/* This is called once we have the cpu_possible_mask */
void __ref xen_setup_vcpu_info_placement(void)
{
int cpu;
for_each_possible_cpu(cpu) {
/* Set up direct vCPU id mapping for PV guests. */
per_cpu(xen_vcpu_id, cpu) = cpu;
/*
* xen_vcpu_setup(cpu) can fail -- in which case it
* falls back to the shared_info version for cpus
* where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
*
* xen_cpu_up_prepare_pv() handles the rest by failing
* them in hotplug.
*/
(void) xen_vcpu_setup(cpu);
}
/*
* xen_vcpu_setup managed to place the vcpu_info within the
* percpu area for all cpus, so make use of it.
*/
if (xen_have_vcpu_info_placement) {
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
}
static const struct pv_info xen_info __initconst = {
.shared_kernel_pmd = 0,
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
.name = "Xen",
};
static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
.get_debugreg = xen_get_debugreg,
.read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = xen_read_cr8,
.write_cr8 = xen_write_cr8,
#endif
.wbinvd = native_wbinvd,
.read_msr = xen_read_msr,
.write_msr = xen_write_msr,
.read_msr_safe = xen_read_msr_safe,
.write_msr_safe = xen_write_msr_safe,
.read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
.usergs_sysret64 = xen_sysret64,
#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
.load_gs_index = xen_load_gs_index,
#endif
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
.store_tr = xen_store_tr,
.write_ldt_entry = xen_write_ldt_entry,
.write_gdt_entry = xen_write_gdt_entry,
.write_idt_entry = xen_write_idt_entry,
.load_sp0 = xen_load_sp0,
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
.swapgs = paravirt_nop,
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
static void xen_restart(char *msg)
{
xen_reboot(SHUTDOWN_reboot);
}
static void xen_machine_halt(void)
{
xen_reboot(SHUTDOWN_poweroff);
}
static void xen_machine_power_off(void)
{
if (pm_power_off)
pm_power_off();
xen_reboot(SHUTDOWN_poweroff);
}
static void xen_crash_shutdown(struct pt_regs *regs)
{
xen_reboot(SHUTDOWN_crash);
}
static const struct machine_ops xen_machine_ops __initconst = {
.restart = xen_restart,
.halt = xen_machine_halt,
.power_off = xen_machine_power_off,
.shutdown = xen_machine_halt,
.crash_shutdown = xen_crash_shutdown,
.emergency_restart = xen_emergency_restart,
};
static unsigned char xen_get_nmi_reason(void)
{
unsigned char reason = 0;
/* Construct a value which looks like it came from port 0x61. */
if (test_bit(_XEN_NMIREASON_io_error,
&HYPERVISOR_shared_info->arch.nmi_reason))
reason |= NMI_REASON_IOCHK;
if (test_bit(_XEN_NMIREASON_pci_serr,
&HYPERVISOR_shared_info->arch.nmi_reason))
reason |= NMI_REASON_SERR;
return reason;
}
static void __init xen_boot_params_init_edd(void)
{
#if IS_ENABLED(CONFIG_EDD)
struct xen_platform_op op;
struct edd_info *edd_info;
u32 *mbr_signature;
unsigned nr;
int ret;
edd_info = boot_params.eddbuf;
mbr_signature = boot_params.edd_mbr_sig_buffer;
op.cmd = XENPF_firmware_info;
op.u.firmware_info.type = XEN_FW_DISK_INFO;
for (nr = 0; nr < EDDMAXNR; nr++) {
struct edd_info *info = edd_info + nr;
op.u.firmware_info.index = nr;
info->params.length = sizeof(info->params);
set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
&info->params);
ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
#define C(x) info->x = op.u.firmware_info.u.disk_info.x
C(device);
C(version);
C(interface_support);
C(legacy_max_cylinder);
C(legacy_max_head);
C(legacy_sectors_per_track);
#undef C
}
boot_params.eddbuf_entries = nr;
op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
op.u.firmware_info.index = nr;
ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
}
boot_params.edd_mbr_sig_buf_entries = nr;
#endif
}
/*
* Set up the GDT and segment registers for -fstack-protector. Until
* we do this, we have to be careful not to call any stack-protected
* function, which is most of the kernel.
*/
static void xen_setup_gdt(int cpu)
{
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
pv_cpu_ops.load_gdt = xen_load_gdt_boot;
setup_stack_canary_segment(0);
switch_to_new_gdt(0);
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
pv_cpu_ops.load_gdt = xen_load_gdt;
}
static void __init xen_dom0_set_legacy_features(void)
{
x86_platform.legacy.rtc = 1;
}
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
struct physdev_set_iopl set_iopl;
unsigned long initrd_start = 0;
int rc;
if (!xen_start_info)
return;
xen_domain_type = XEN_PV_DOMAIN;
xen_setup_features();
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops.patch = paravirt_patch_default;
pv_cpu_ops = xen_cpu_ops;
x86_platform.get_nmi_reason = xen_get_nmi_reason;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_init = x86_init_noop;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
/*
* Set up some pagetable state before starting to set any ptes.
*/
xen_init_mmu_ops();
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
/*
* Prevent page tables from being allocated in highmem, even
* if CONFIG_HIGHPTE is enabled.
*/
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
/* Work out if we support NX */
x86_configure_nx();
/* Get mfn list */
xen_build_dynamic_phys_to_machine();
/*
* Set up kernel GDT and segment registers, mainly so that
* -fstack-protector code can be executed.
*/
xen_setup_gdt(0);
xen_init_irq_ops();
xen_init_capabilities();
#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
*/
xen_init_apic();
#endif
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
}
machine_ops = xen_machine_ops;
/*
* The only reliable way to retain the initial address of the
* percpu gdt_page is to remember it here, so we can go and
* mark it RW later, when the initial percpu area is freed.
*/
xen_initial_gdt = &per_cpu(gdt_page, 0);
xen_smp_init();
#ifdef CONFIG_ACPI_NUMA
/*
* The pages we from Xen are not related to machine pages, so
* any NUMA information the kernel tries to get from ACPI will
* be meaningless. Prevent it from trying.
*/
acpi_numa = -1;
#endif
/* Let's presume PV guests always boot on vCPU with id 0. */
per_cpu(xen_vcpu_id, 0) = 0;
/*
* Setup xen_vcpu early because start_kernel needs it for
* local_irq_disable(), irqs_disabled().
*
* Don't do the full vcpu_info placement stuff until we have
* the cpu_possible_mask and a non-dummy shared_info.
*/
xen_vcpu_info_reset(0);
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
local_irq_disable();
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
xen_start_info->nr_pages);
xen_reserve_special_pages();
/* keep using Xen gdt for now; no urgent need to change it */
#ifdef CONFIG_X86_32
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
#else
pv_info.kernel_rpl = 0;
#endif
/* set the limit of our address space */
xen_reserve_top();
/*
* We used to do this in xen_arch_setup, but that is too late
* on AMD were early_cpu_init (run before ->arch_setup()) calls
* early_amd_init which pokes 0xcf8 port.
*/
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
xen_raw_printk("physdev_op failed %d\n", rc);
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif
if (xen_start_info->mod_start) {
if (xen_start_info->flags & SIF_MOD_START_PFN)
initrd_start = PFN_PHYS(xen_start_info->mod_start);
else
initrd_start = __pa(xen_start_info->mod_start);
}
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
boot_params.hdr.ramdisk_image = initrd_start;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
if (!xen_initial_domain()) {
add_preferred_console("xenboot", 0, NULL);
add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
if (pci_xen)
x86_init.pci.arch_init = pci_xen_init;
} else {
const struct dom0_vga_console_info *info =
(void *)((char *)xen_start_info +
xen_start_info->console.dom0.info_off);
struct xen_platform_op op = {
.cmd = XENPF_firmware_info,
.interface_version = XENPF_INTERFACE_VERSION,
.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
};
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
xen_init_vga(info, xen_start_info->console.dom0.info_size);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
if (HYPERVISOR_platform_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
/* Make sure ACS will be enabled */
pci_request_acs();
xen_acpi_sleep_register();
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_smp_config = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
xen_boot_params_init_edd();
}
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
pci_probe &= ~PCI_PROBE_BIOS;
#endif
xen_raw_console_write("about to get started...\n");
/* We need this for printk timestamps */
xen_setup_runstate_info(0);
xen_efi_init();
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
#else
cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
static int xen_cpu_up_prepare_pv(unsigned int cpu)
{
int rc;
if (per_cpu(xen_vcpu, cpu) == NULL)
return -ENODEV;
xen_setup_timer(cpu);
rc = xen_smp_intr_init(cpu);
if (rc) {
WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
cpu, rc);
return rc;
}
rc = xen_smp_intr_init_pv(cpu);
if (rc) {
WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
cpu, rc);
return rc;
}
return 0;
}
static int xen_cpu_dead_pv(unsigned int cpu)
{
xen_smp_intr_free(cpu);
xen_smp_intr_free_pv(cpu);
xen_teardown_timer(cpu);
return 0;
}
static uint32_t __init xen_platform_pv(void)
{
if (xen_pv_domain())
return xen_cpuid_base();
return 0;
}
const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
.name = "Xen PV",
.detect = xen_platform_pv,
.type = X86_HYPER_XEN_PV,
.runtime.pin_vcpu = xen_pin_vcpu,
};