- Add MTE support in guests, complete with tag save/restore interface
 
 - Reduce the impact of CMOs by moving them in the page-table code
 
 - Allow device block mappings at stage-2
 
 - Reduce the footprint of the vmemmap in protected mode
 
 - Support the vGIC on dumb systems such as the Apple M1
 
 - Add selftest infrastructure to support multiple configuration
   and apply that to PMU/non-PMU setups
 
 - Add selftests for the debug architecture
 
 - The usual crop of PMU fixes
 
 PPC:
 
 - Support for the H_RPT_INVALIDATE hypercall
 
 - Conversion of Book3S entry/exit to C
 
 - Bug fixes
 
 S390:
 
 - new HW facilities for guests
 
 - make inline assembly more robust with KASAN and co
 
 x86:
 
 - Allow userspace to handle emulation errors (unknown instructions)
 
 - Lazy allocation of the rmap (host physical -> guest physical address)
 
 - Support for virtualizing TSC scaling on VMX machines
 
 - Optimizations to avoid shattering huge pages at the beginning of live migration
 
 - Support for initializing the PDPTRs without loading them from memory
 
 - Many TLB flushing cleanups
 
 - Refuse to load if two-stage paging is available but NX is not (this has
   been a requirement in practice for over a year)
 
 - A large series that separates the MMU mode (WP/SMAP/SMEP etc.) from
   CR0/CR4/EFER, using the MMU mode everywhere once it is computed
   from the CPU registers
 
 - Use PM notifier to notify the guest about host suspend or hibernate
 
 - Support for passing arguments to Hyper-V hypercalls using XMM registers
 
 - Support for Hyper-V TLB flush hypercalls and enlightened MSR bitmap on
   AMD processors
 
 - Hide Hyper-V hypercalls that are not included in the guest CPUID
 
 - Fixes for live migration of virtual machines that use the Hyper-V
   "enlightened VMCS" optimization of nested virtualization
 
 - Bugfixes (not many)
 
 Generic:
 
 - Support for retrieving statistics without debugfs
 
 - Cleanups for the KVM selftests API
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmDV9UYUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroOIRgf/XX8fKLh24RnTOs2ldIu2AfRGVrT4
 QMrr8MxhmtukBAszk2xKvBt8/6gkUjdaIC3xqEnVjxaDaUvZaEtP7CQlF5JV45rn
 iv1zyxUKucXrnIOr+gCioIT7qBlh207zV35ArKioP9Y83cWx9uAs22pfr6g+7RxO
 h8bJZlJbSG6IGr3voANCIb9UyjU1V/l8iEHqRwhmr/A5rARPfD7g8lfMEQeGkzX6
 +/UydX2fumB3tl8e2iMQj6vLVdSOsCkehvpHK+Z33EpkKhan7GwZ2sZ05WmXV/nY
 QLAYfD10KegoNWl5Ay4GTp4hEAIYVrRJCLC+wnLdc0U8udbfCuTC31LK4w==
 =NcRh
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
 "This covers all architectures (except MIPS) so I don't expect any
  other feature pull requests this merge window.

  ARM:

   - Add MTE support in guests, complete with tag save/restore interface

   - Reduce the impact of CMOs by moving them in the page-table code

   - Allow device block mappings at stage-2

   - Reduce the footprint of the vmemmap in protected mode

   - Support the vGIC on dumb systems such as the Apple M1

   - Add selftest infrastructure to support multiple configuration and
     apply that to PMU/non-PMU setups

   - Add selftests for the debug architecture

   - The usual crop of PMU fixes

  PPC:

   - Support for the H_RPT_INVALIDATE hypercall

   - Conversion of Book3S entry/exit to C

   - Bug fixes

  S390:

   - new HW facilities for guests

   - make inline assembly more robust with KASAN and co

  x86:

   - Allow userspace to handle emulation errors (unknown instructions)

   - Lazy allocation of the rmap (host physical -> guest physical
     address)

   - Support for virtualizing TSC scaling on VMX machines

   - Optimizations to avoid shattering huge pages at the beginning of
     live migration

   - Support for initializing the PDPTRs without loading them from
     memory

   - Many TLB flushing cleanups

   - Refuse to load if two-stage paging is available but NX is not (this
     has been a requirement in practice for over a year)

   - A large series that separates the MMU mode (WP/SMAP/SMEP etc.) from
     CR0/CR4/EFER, using the MMU mode everywhere once it is computed
     from the CPU registers

   - Use PM notifier to notify the guest about host suspend or hibernate

   - Support for passing arguments to Hyper-V hypercalls using XMM
     registers

   - Support for Hyper-V TLB flush hypercalls and enlightened MSR bitmap
     on AMD processors

   - Hide Hyper-V hypercalls that are not included in the guest CPUID

   - Fixes for live migration of virtual machines that use the Hyper-V
     "enlightened VMCS" optimization of nested virtualization

   - Bugfixes (not many)

  Generic:

   - Support for retrieving statistics without debugfs

   - Cleanups for the KVM selftests API"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (314 commits)
  KVM: x86: rename apic_access_page_done to apic_access_memslot_enabled
  kvm: x86: disable the narrow guest module parameter on unload
  selftests: kvm: Allows userspace to handle emulation errors.
  kvm: x86: Allow userspace to handle emulation errors
  KVM: x86/mmu: Let guest use GBPAGES if supported in hardware and TDP is on
  KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault
  KVM: x86/mmu: Get CR0.WP from MMU, not vCPU, in shadow page fault
  KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT
  KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic
  KVM: x86: Enhance comments for MMU roles and nested transition trickiness
  KVM: x86/mmu: WARN on any reserved SPTE value when making a valid SPTE
  KVM: x86/mmu: Add helpers to do full reserved SPTE checks w/ generic MMU
  KVM: x86/mmu: Use MMU's role to determine PTTYPE
  KVM: x86/mmu: Collapse 32-bit PAE and 64-bit statements for helpers
  KVM: x86/mmu: Add a helper to calculate root from role_regs
  KVM: x86/mmu: Add helper to update paging metadata
  KVM: x86/mmu: Don't update nested guest's paging bitmasks if CR0.PG=0
  KVM: x86/mmu: Consolidate reset_rsvds_bits_mask() calls
  KVM: x86/mmu: Use MMU role_regs to get LA57, and drop vCPU LA57 helper
  KVM: x86/mmu: Get nested MMU's root level from the MMU's role
  ...
This commit is contained in:
Linus Torvalds 2021-06-28 15:40:51 -07:00
commit 36824f198c
211 changed files with 10501 additions and 4275 deletions

View File

@ -688,9 +688,14 @@ MSRs that have been set successfully.
Defines the vcpu responses to the cpuid instruction. Applications
should use the KVM_SET_CPUID2 ioctl if available.
Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID
configuration (if there is) is not corrupted. Userspace can get a copy of the
resulting CPUID configuration through KVM_GET_CPUID2 in case.
Caveat emptor:
- If this IOCTL fails, KVM gives no guarantees that previous valid CPUID
configuration (if there is) is not corrupted. Userspace can get a copy
of the resulting CPUID configuration through KVM_GET_CPUID2 in case.
- Using KVM_SET_CPUID{,2} after KVM_RUN, i.e. changing the guest vCPU model
after running the guest, may cause guest instability.
- Using heterogeneous CPUID configurations, modulo APIC IDs, topology, etc...
may cause guest instability.
::
@ -5034,6 +5039,260 @@ see KVM_XEN_VCPU_SET_ATTR above.
The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used
with the KVM_XEN_VCPU_GET_ATTR ioctl.
4.130 KVM_ARM_MTE_COPY_TAGS
---------------------------
:Capability: KVM_CAP_ARM_MTE
:Architectures: arm64
:Type: vm ioctl
:Parameters: struct kvm_arm_copy_mte_tags
:Returns: number of bytes copied, < 0 on error (-EINVAL for incorrect
arguments, -EFAULT if memory cannot be accessed).
::
struct kvm_arm_copy_mte_tags {
__u64 guest_ipa;
__u64 length;
void __user *addr;
__u64 flags;
__u64 reserved[2];
};
Copies Memory Tagging Extension (MTE) tags to/from guest tag memory. The
``guest_ipa`` and ``length`` fields must be ``PAGE_SIZE`` aligned. The ``addr``
field must point to a buffer which the tags will be copied to or from.
``flags`` specifies the direction of copy, either ``KVM_ARM_TAGS_TO_GUEST`` or
``KVM_ARM_TAGS_FROM_GUEST``.
The size of the buffer to store the tags is ``(length / 16)`` bytes
(granules in MTE are 16 bytes long). Each byte contains a single tag
value. This matches the format of ``PTRACE_PEEKMTETAGS`` and
``PTRACE_POKEMTETAGS``.
If an error occurs before any data is copied then a negative error code is
returned. If some tags have been copied before an error occurs then the number
of bytes successfully copied is returned. If the call completes successfully
then ``length`` is returned.
4.131 KVM_GET_SREGS2
------------------
:Capability: KVM_CAP_SREGS2
:Architectures: x86
:Type: vcpu ioctl
:Parameters: struct kvm_sregs2 (out)
:Returns: 0 on success, -1 on error
Reads special registers from the vcpu.
This ioctl (when supported) replaces the KVM_GET_SREGS.
::
struct kvm_sregs2 {
/* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
struct kvm_segment cs, ds, es, fs, gs, ss;
struct kvm_segment tr, ldt;
struct kvm_dtable gdt, idt;
__u64 cr0, cr2, cr3, cr4, cr8;
__u64 efer;
__u64 apic_base;
__u64 flags;
__u64 pdptrs[4];
};
flags values for ``kvm_sregs2``:
``KVM_SREGS2_FLAGS_PDPTRS_VALID``
Indicates thats the struct contain valid PDPTR values.
4.132 KVM_SET_SREGS2
------------------
:Capability: KVM_CAP_SREGS2
:Architectures: x86
:Type: vcpu ioctl
:Parameters: struct kvm_sregs2 (in)
:Returns: 0 on success, -1 on error
Writes special registers into the vcpu.
See KVM_GET_SREGS2 for the data structures.
This ioctl (when supported) replaces the KVM_SET_SREGS.
4.133 KVM_GET_STATS_FD
----------------------
:Capability: KVM_CAP_STATS_BINARY_FD
:Architectures: all
:Type: vm ioctl, vcpu ioctl
:Parameters: none
:Returns: statistics file descriptor on success, < 0 on error
Errors:
====== ======================================================
ENOMEM if the fd could not be created due to lack of memory
EMFILE if the number of opened files exceeds the limit
====== ======================================================
The returned file descriptor can be used to read VM/vCPU statistics data in
binary format. The data in the file descriptor consists of four blocks
organized as follows:
+-------------+
| Header |
+-------------+
| id string |
+-------------+
| Descriptors |
+-------------+
| Stats Data |
+-------------+
Apart from the header starting at offset 0, please be aware that it is
not guaranteed that the four blocks are adjacent or in the above order;
the offsets of the id, descriptors and data blocks are found in the
header. However, all four blocks are aligned to 64 bit offsets in the
file and they do not overlap.
All blocks except the data block are immutable. Userspace can read them
only one time after retrieving the file descriptor, and then use ``pread`` or
``lseek`` to read the statistics repeatedly.
All data is in system endianness.
The format of the header is as follows::
struct kvm_stats_header {
__u32 flags;
__u32 name_size;
__u32 num_desc;
__u32 id_offset;
__u32 desc_offset;
__u32 data_offset;
};
The ``flags`` field is not used at the moment. It is always read as 0.
The ``name_size`` field is the size (in byte) of the statistics name string
(including trailing '\0') which is contained in the "id string" block and
appended at the end of every descriptor.
The ``num_desc`` field is the number of descriptors that are included in the
descriptor block. (The actual number of values in the data block may be
larger, since each descriptor may comprise more than one value).
The ``id_offset`` field is the offset of the id string from the start of the
file indicated by the file descriptor. It is a multiple of 8.
The ``desc_offset`` field is the offset of the Descriptors block from the start
of the file indicated by the file descriptor. It is a multiple of 8.
The ``data_offset`` field is the offset of the Stats Data block from the start
of the file indicated by the file descriptor. It is a multiple of 8.
The id string block contains a string which identifies the file descriptor on
which KVM_GET_STATS_FD was invoked. The size of the block, including the
trailing ``'\0'``, is indicated by the ``name_size`` field in the header.
The descriptors block is only needed to be read once for the lifetime of the
file descriptor contains a sequence of ``struct kvm_stats_desc``, each followed
by a string of size ``name_size``.
#define KVM_STATS_TYPE_SHIFT 0
#define KVM_STATS_TYPE_MASK (0xF << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT)
#define KVM_STATS_UNIT_SHIFT 4
#define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_NONE (0x0 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT)
#define KVM_STATS_BASE_SHIFT 8
#define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT)
#define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT)
#define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT)
struct kvm_stats_desc {
__u32 flags;
__s16 exponent;
__u16 size;
__u32 offset;
__u32 unused;
char name[];
};
The ``flags`` field contains the type and unit of the statistics data described
by this descriptor. Its endianness is CPU native.
The following flags are supported:
Bits 0-3 of ``flags`` encode the type:
* ``KVM_STATS_TYPE_CUMULATIVE``
The statistics data is cumulative. The value of data can only be increased.
Most of the counters used in KVM are of this type.
The corresponding ``size`` field for this type is always 1.
All cumulative statistics data are read/write.
* ``KVM_STATS_TYPE_INSTANT``
The statistics data is instantaneous. Its value can be increased or
decreased. This type is usually used as a measurement of some resources,
like the number of dirty pages, the number of large pages, etc.
All instant statistics are read only.
The corresponding ``size`` field for this type is always 1.
* ``KVM_STATS_TYPE_PEAK``
The statistics data is peak. The value of data can only be increased, and
represents a peak value for a measurement, for example the maximum number
of items in a hash table bucket, the longest time waited and so on.
The corresponding ``size`` field for this type is always 1.
Bits 4-7 of ``flags`` encode the unit:
* ``KVM_STATS_UNIT_NONE``
There is no unit for the value of statistics data. This usually means that
the value is a simple counter of an event.
* ``KVM_STATS_UNIT_BYTES``
It indicates that the statistics data is used to measure memory size, in the
unit of Byte, KiByte, MiByte, GiByte, etc. The unit of the data is
determined by the ``exponent`` field in the descriptor.
* ``KVM_STATS_UNIT_SECONDS``
It indicates that the statistics data is used to measure time or latency.
* ``KVM_STATS_UNIT_CYCLES``
It indicates that the statistics data is used to measure CPU clock cycles.
Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the
unit:
* ``KVM_STATS_BASE_POW10``
The scale is based on power of 10. It is used for measurement of time and
CPU clock cycles. For example, an exponent of -9 can be used with
``KVM_STATS_UNIT_SECONDS`` to express that the unit is nanoseconds.
* ``KVM_STATS_BASE_POW2``
The scale is based on power of 2. It is used for measurement of memory size.
For example, an exponent of 20 can be used with ``KVM_STATS_UNIT_BYTES`` to
express that the unit is MiB.
The ``size`` field is the number of values of this statistics data. Its
value is usually 1 for most of simple statistics. 1 means it contains an
unsigned 64bit data.
The ``offset`` field is the offset from the start of Data Block to the start of
the corresponding statistics data.
The ``unused`` field is reserved for future support for other types of
statistics data, like log/linear histogram. Its value is always 0 for the types
defined above.
The ``name`` field is the name string of the statistics data. The name string
starts at the end of ``struct kvm_stats_desc``. The maximum length including
the trailing ``'\0'``, is indicated by ``name_size`` in the header.
The Stats Data block contains an array of 64-bit values in the same order
as the descriptors in Descriptors block.
5. The kvm_run structure
========================
@ -6323,6 +6582,7 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
This capability can be used to check / enable 2nd DAWR feature provided
by POWER10 processor.
7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
-------------------------------------
@ -6362,6 +6622,66 @@ default.
See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
7.26 KVM_CAP_PPC_RPT_INVALIDATE
-------------------------------
:Capability: KVM_CAP_PPC_RPT_INVALIDATE
:Architectures: ppc
:Type: vm
This capability indicates that the kernel is capable of handling
H_RPT_INVALIDATE hcall.
In order to enable the use of H_RPT_INVALIDATE in the guest,
user space might have to advertise it for the guest. For example,
IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
present in the "ibm,hypertas-functions" device-tree property.
This capability is enabled for hypervisors on platforms like POWER9
that support radix MMU.
7.27 KVM_CAP_EXIT_ON_EMULATION_FAILURE
--------------------------------------
:Architectures: x86
:Parameters: args[0] whether the feature should be enabled or not
When this capability is enabled, an emulation failure will result in an exit
to userspace with KVM_INTERNAL_ERROR (except when the emulator was invoked
to handle a VMware backdoor instruction). Furthermore, KVM will now provide up
to 15 instruction bytes for any exit to userspace resulting from an emulation
failure. When these exits to userspace occur use the emulation_failure struct
instead of the internal struct. They both have the same layout, but the
emulation_failure struct matches the content better. It also explicitly
defines the 'flags' field which is used to describe the fields in the struct
that are valid (ie: if KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES is
set in the 'flags' field then both 'insn_size' and 'insn_bytes' have valid data
in them.)
7.28 KVM_CAP_ARM_MTE
--------------------
:Architectures: arm64
:Parameters: none
This capability indicates that KVM (and the hardware) supports exposing the
Memory Tagging Extensions (MTE) to the guest. It must also be enabled by the
VMM before creating any VCPUs to allow the guest access. Note that MTE is only
available to a guest running in AArch64 mode and enabling this capability will
cause attempts to create AArch32 VCPUs to fail.
When enabled the guest is able to access tags associated with any memory given
to the guest. KVM will ensure that the tags are maintained during swap or
hibernation of the host; however the VMM needs to manually save/restore the
tags as appropriate if the VM is migrated.
When this capability is enabled all memory in memslots must be mapped as
not-shareable (no MAP_SHARED), attempts to create a memslot with a
MAP_SHARED mmap will result in an -EINVAL return.
When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
perform a bulk copy of tags to/from the guest.
8. Other capabilities.
======================
@ -6891,3 +7211,33 @@ This capability is always enabled.
This capability indicates that the KVM virtual PTP service is
supported in the host. A VMM can check whether the service is
available to the guest on migration.
8.33 KVM_CAP_HYPERV_ENFORCE_CPUID
-----------------------------
Architectures: x86
When enabled, KVM will disable emulated Hyper-V features provided to the
guest according to the bits Hyper-V CPUID feature leaves. Otherwise, all
currently implmented Hyper-V features are provided unconditionally when
Hyper-V identification is set in the HYPERV_CPUID_INTERFACE (0x40000001)
leaf.
8.34 KVM_CAP_EXIT_HYPERCALL
---------------------------
:Capability: KVM_CAP_EXIT_HYPERCALL
:Architectures: x86
:Type: vm
This capability, if enabled, will cause KVM to exit to userspace
with KVM_EXIT_HYPERCALL exit reason to process some hypercalls.
Calling KVM_CHECK_EXTENSION for this capability will return a bitmask
of hypercalls that can be configured to exit to userspace.
Right now, the only such hypercall is KVM_HC_MAP_GPA_RANGE.
The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset
of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace
the hypercalls whose corresponding bit is in the argument, and return
ENOSYS for the others.

View File

@ -96,6 +96,13 @@ KVM_FEATURE_MSI_EXT_DEST_ID 15 guest checks this feature bit
before using extended destination
ID bits in MSI address bits 11-5.
KVM_FEATURE_HC_MAP_GPA_RANGE 16 guest checks this feature bit before
using the map gpa range hypercall
to notify the page state change
KVM_FEATURE_MIGRATION_CONTROL 17 guest checks this feature bit before
using MSR_KVM_MIGRATION_CONTROL
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side
per-cpu warps are expected in
kvmclock

View File

@ -169,3 +169,24 @@ a0: destination APIC ID
:Usage example: When sending a call-function IPI-many to vCPUs, yield if
any of the IPI target vCPUs was preempted.
8. KVM_HC_MAP_GPA_RANGE
-------------------------
:Architecture: x86
:Status: active
:Purpose: Request KVM to map a GPA range with the specified attributes.
a0: the guest physical address of the start page
a1: the number of (4kb) pages (must be contiguous in GPA space)
a2: attributes
Where 'attributes' :
* bits 3:0 - preferred page size encoding 0 = 4kb, 1 = 2mb, 2 = 1gb, etc...
* bit 4 - plaintext = 0, encrypted = 1
* bits 63:5 - reserved (must be zero)
**Implementation note**: this hypercall is implemented in userspace via
the KVM_CAP_EXIT_HYPERCALL capability. Userspace must enable that capability
before advertising KVM_FEATURE_HC_MAP_GPA_RANGE in the guest CPUID. In
addition, if the guest supports KVM_FEATURE_MIGRATION_CONTROL, userspace
must also set up an MSR filter to process writes to MSR_KVM_MIGRATION_CONTROL.

View File

@ -16,6 +16,11 @@ The acquisition orders for mutexes are as follows:
- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
them together is quite rare.
- Unlike kvm->slots_lock, kvm->slots_arch_lock is released before
synchronize_srcu(&kvm->srcu). Therefore kvm->slots_arch_lock
can be taken inside a kvm->srcu read-side critical section,
while kvm->slots_lock cannot.
On x86:
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock

View File

@ -180,8 +180,8 @@ Shadow pages contain the following information:
role.gpte_is_8_bytes:
Reflects the size of the guest PTE for which the page is valid, i.e. '1'
if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
role.nxe:
Contains the value of efer.nxe for which the page is valid.
role.efer_nx:
Contains the value of efer.nx for which the page is valid.
role.cr0_wp:
Contains the value of cr0.wp for which the page is valid.
role.smep_andnot_wp:
@ -192,9 +192,6 @@ Shadow pages contain the following information:
Contains the value of cr4.smap && !cr0.wp for which the page is valid
(pages for which this is true are different from other pages; see the
treatment of cr0.wp=0 below).
role.ept_sp:
This is a virtual flag to denote a shadowed nested EPT page. ept_sp
is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination.
role.smm:
Is 1 if the page is valid in system management mode. This field
determines which of the kvm_memslots array was used to build this

View File

@ -376,3 +376,16 @@ data:
write '1' to bit 0 of the MSR, this causes the host to re-scan its queue
and check if there are more notifications pending. The MSR is available
if KVM_FEATURE_ASYNC_PF_INT is present in CPUID.
MSR_KVM_MIGRATION_CONTROL:
0x4b564d08
data:
This MSR is available if KVM_FEATURE_MIGRATION_CONTROL is present in
CPUID. Bit 0 represents whether live migration of the guest is allowed.
When a guest is started, bit 0 will be 0 if the guest has encrypted
memory and 1 if the guest does not have encrypted memory. If the
guest is communicating page encryption status to the host using the
``KVM_HC_MAP_GPA_RANGE`` hypercall, it can set bit 0 in this MSR to
allow live migration of the guest.

View File

@ -9991,6 +9991,8 @@ F: arch/arm64/include/asm/kvm*
F: arch/arm64/include/uapi/asm/kvm*
F: arch/arm64/kvm/
F: include/kvm/arm_*
F: tools/testing/selftests/kvm/*/aarch64/
F: tools/testing/selftests/kvm/aarch64/
KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
M: Huacai Chen <chenhuacai@kernel.org>

View File

@ -12,7 +12,8 @@
#include <asm/types.h>
/* Hyp Configuration Register (HCR) bits */
#define HCR_ATA (UL(1) << 56)
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)

View File

@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 |= HCR_TID2;
if (kvm_has_mte(vcpu->kvm))
vcpu->arch.hcr_el2 |= HCR_ATA;
}
static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)

View File

@ -46,6 +46,7 @@
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
@ -132,6 +133,9 @@ struct kvm_arch {
u8 pfr0_csv2;
u8 pfr0_csv3;
/* Memory Tagging Extension enabled for the guest */
bool mte_enabled;
};
struct kvm_vcpu_fault_info {
@ -206,6 +210,12 @@ enum vcpu_sysreg {
CNTP_CVAL_EL0,
CNTP_CTL_EL0,
/* Memory Tagging Extension registers */
RGSR_EL1, /* Random Allocation Tag Seed Register */
GCR_EL1, /* Tag Control Register */
TFSR_EL1, /* Tag Fault Status Register (EL1) */
TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
/* 32bit specific registers. Keep them at the end of the range */
DACR32_EL2, /* Domain Access Control Register */
IFSR32_EL2, /* Instruction Fault Status Register */
@ -556,16 +566,11 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
}
struct kvm_vm_stat {
ulong remote_tlb_flush;
struct kvm_vm_stat_generic generic;
};
struct kvm_vcpu_stat {
u64 halt_successful_poll;
u64 halt_attempted_poll;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
u64 halt_poll_invalid;
u64 halt_wakeup;
struct kvm_vcpu_stat_generic generic;
u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
@ -721,6 +726,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
struct kvm_arm_copy_mte_tags *copy_tags);
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
@ -769,6 +777,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
#define kvm_arm_vcpu_sve_finalized(vcpu) \
((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
#define kvm_vcpu_has_pmu(vcpu) \
(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))

View File

@ -188,10 +188,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
}
static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
static inline void __clean_dcache_guest_page(void *va, size_t size)
{
void *va = page_address(pfn_to_page(pfn));
/*
* With FWB, we ensure that the guest always accesses memory using
* cacheable attributes, and we don't have to clean to PoC when
@ -204,18 +202,14 @@ static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
kvm_flush_dcache_to_poc(va, size);
}
static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
unsigned long size)
static inline void __invalidate_icache_guest_page(void *va, size_t size)
{
if (icache_is_aliasing()) {
/* any kind of VIPT cache */
icache_inval_all_pou();
} else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
void *va = page_address(pfn_to_page(pfn));
icache_inval_pou((unsigned long)va,
(unsigned long)va + size);
icache_inval_pou((unsigned long)va, (unsigned long)va + size);
}
}

View File

@ -0,0 +1,66 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2020-2021 ARM Ltd.
*/
#ifndef __ASM_KVM_MTE_H
#define __ASM_KVM_MTE_H
#ifdef __ASSEMBLY__
#include <asm/sysreg.h>
#ifdef CONFIG_ARM64_MTE
.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
alternative_if_not ARM64_MTE
b .L__skip_switch\@
alternative_else_nop_endif
mrs \reg1, hcr_el2
tbz \reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
mrs_s \reg1, SYS_RGSR_EL1
str \reg1, [\h_ctxt, #CPU_RGSR_EL1]
mrs_s \reg1, SYS_GCR_EL1
str \reg1, [\h_ctxt, #CPU_GCR_EL1]
ldr \reg1, [\g_ctxt, #CPU_RGSR_EL1]
msr_s SYS_RGSR_EL1, \reg1
ldr \reg1, [\g_ctxt, #CPU_GCR_EL1]
msr_s SYS_GCR_EL1, \reg1
.L__skip_switch\@:
.endm
.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
alternative_if_not ARM64_MTE
b .L__skip_switch\@
alternative_else_nop_endif
mrs \reg1, hcr_el2
tbz \reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
mrs_s \reg1, SYS_RGSR_EL1
str \reg1, [\g_ctxt, #CPU_RGSR_EL1]
mrs_s \reg1, SYS_GCR_EL1
str \reg1, [\g_ctxt, #CPU_GCR_EL1]
ldr \reg1, [\h_ctxt, #CPU_RGSR_EL1]
msr_s SYS_RGSR_EL1, \reg1
ldr \reg1, [\h_ctxt, #CPU_GCR_EL1]
msr_s SYS_GCR_EL1, \reg1
isb
.L__skip_switch\@:
.endm
#else /* !CONFIG_ARM64_MTE */
.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
.endm
.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
.endm
#endif /* CONFIG_ARM64_MTE */
#endif /* __ASSEMBLY__ */
#endif /* __ASM_KVM_MTE_H */

View File

@ -27,23 +27,29 @@ typedef u64 kvm_pte_t;
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page. The @arg parameter
* can be used by the walker to pass a memcache. The
* initial refcount of the page is 1.
* @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. The
* @size parameter is in bytes, and is rounded-up to the
* next page boundary. The resulting allocation is
* physically contiguous.
* @free_pages_exact: Free an exact number of memory pages previously
* allocated by zalloc_pages_exact.
* @get_page: Increment the refcount on a page.
* @put_page: Decrement the refcount on a page. When the refcount
* reaches 0 the page is automatically freed.
* @page_count: Return the refcount of a page.
* @phys_to_virt: Convert a physical address into a virtual address mapped
* in the current context.
* @virt_to_phys: Convert a virtual address mapped in the current context
* into a physical address.
* @zalloc_page: Allocate a single zeroed memory page.
* The @arg parameter can be used by the walker
* to pass a memcache. The initial refcount of
* the page is 1.
* @zalloc_pages_exact: Allocate an exact number of zeroed memory pages.
* The @size parameter is in bytes, and is rounded
* up to the next page boundary. The resulting
* allocation is physically contiguous.
* @free_pages_exact: Free an exact number of memory pages previously
* allocated by zalloc_pages_exact.
* @get_page: Increment the refcount on a page.
* @put_page: Decrement the refcount on a page. When the
* refcount reaches 0 the page is automatically
* freed.
* @page_count: Return the refcount of a page.
* @phys_to_virt: Convert a physical address into a virtual
* address mapped in the current context.
* @virt_to_phys: Convert a virtual address mapped in the current
* context into a physical address.
* @dcache_clean_inval_poc: Clean and invalidate the data cache to the PoC
* for the specified memory address range.
* @icache_inval_pou: Invalidate the instruction cache to the PoU
* for the specified memory address range.
*/
struct kvm_pgtable_mm_ops {
void* (*zalloc_page)(void *arg);
@ -54,6 +60,8 @@ struct kvm_pgtable_mm_ops {
int (*page_count)(void *addr);
void* (*phys_to_virt)(phys_addr_t phys);
phys_addr_t (*virt_to_phys)(void *addr);
void (*dcache_clean_inval_poc)(void *addr, size_t size);
void (*icache_inval_pou)(void *addr, size_t size);
};
/**

View File

@ -7,6 +7,7 @@
#define MTE_GRANULE_SIZE UL(16)
#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1))
#define MTE_GRANULES_PER_PAGE (PAGE_SIZE / MTE_GRANULE_SIZE)
#define MTE_TAG_SHIFT 56
#define MTE_TAG_SIZE 4
#define MTE_TAG_MASK GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT)

View File

@ -38,7 +38,7 @@ void mte_free_tag_storage(char *storage);
#define PG_mte_tagged PG_arch_2
void mte_zero_clear_page_tags(void *addr);
void mte_sync_tags(pte_t *ptep, pte_t pte);
void mte_sync_tags(pte_t old_pte, pte_t pte);
void mte_copy_page_tags(void *kto, const void *kfrom);
void mte_thread_init_user(void);
void mte_thread_switch(struct task_struct *next);
@ -57,7 +57,7 @@ int mte_ptrace_copy_tags(struct task_struct *child, long request,
static inline void mte_zero_clear_page_tags(void *addr)
{
}
static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
static inline void mte_sync_tags(pte_t old_pte, pte_t pte)
{
}
static inline void mte_copy_page_tags(void *kto, const void *kfrom)

View File

@ -314,9 +314,25 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
__sync_icache_dcache(pte);
if (system_supports_mte() &&
pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
mte_sync_tags(ptep, pte);
/*
* If the PTE would provide user space access to the tags associated
* with it then ensure that the MTE tags are synchronised. Although
* pte_access_permitted() returns false for exec only mappings, they
* don't expose tags (instruction fetches don't check tags).
*/
if (system_supports_mte() && pte_access_permitted(pte, false) &&
!pte_special(pte)) {
pte_t old_pte = READ_ONCE(*ptep);
/*
* We only need to synchronise if the new PTE has tags enabled
* or if swapping in (in which case another mapping may have
* set tags in the past even if this PTE isn't tagged).
* (!pte_none() && !pte_present()) is an open coded version of
* is_swap_pte()
*/
if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte)))
mte_sync_tags(old_pte, pte);
}
__check_racy_pte_update(mm, ptep, pte);

View File

@ -651,7 +651,8 @@
#define INIT_SCTLR_EL2_MMU_ON \
(SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \
SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | \
SCTLR_ELx_ITFSB | SCTLR_EL2_RES1)
#define INIT_SCTLR_EL2_MMU_OFF \
(SCTLR_EL2_RES1 | ENDIAN_SET_EL2)

View File

@ -184,6 +184,17 @@ struct kvm_vcpu_events {
__u32 reserved[12];
};
struct kvm_arm_copy_mte_tags {
__u64 guest_ipa;
__u64 length;
void __user *addr;
__u64 flags;
__u64 reserved[2];
};
#define KVM_ARM_TAGS_TO_GUEST 0
#define KVM_ARM_TAGS_FROM_GUEST 1
/* If you need to interpret the index values, here is the key: */
#define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000
#define KVM_REG_ARM_COPROC_SHIFT 16

View File

@ -113,6 +113,8 @@ int main(void)
DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags));
DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs));
DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1]));
DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1]));
DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1]));
DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1]));

View File

@ -32,10 +32,9 @@ DEFINE_STATIC_KEY_FALSE(mte_async_mode);
EXPORT_SYMBOL_GPL(mte_async_mode);
#endif
static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
static void mte_sync_page_tags(struct page *page, pte_t old_pte,
bool check_swap, bool pte_is_tagged)
{
pte_t old_pte = READ_ONCE(*ptep);
if (check_swap && is_swap_pte(old_pte)) {
swp_entry_t entry = pte_to_swp_entry(old_pte);
@ -43,6 +42,9 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
return;
}
if (!pte_is_tagged)
return;
page_kasan_tag_reset(page);
/*
* We need smp_wmb() in between setting the flags and clearing the
@ -55,16 +57,22 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
mte_clear_page_tags(page_address(page));
}
void mte_sync_tags(pte_t *ptep, pte_t pte)
void mte_sync_tags(pte_t old_pte, pte_t pte)
{
struct page *page = pte_page(pte);
long i, nr_pages = compound_nr(page);
bool check_swap = nr_pages == 1;
bool pte_is_tagged = pte_tagged(pte);
/* Early out if there's nothing to do */
if (!check_swap && !pte_is_tagged)
return;
/* if PG_mte_tagged is set, tags have already been initialised */
for (i = 0; i < nr_pages; i++, page++) {
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
mte_sync_page_tags(page, ptep, check_swap);
mte_sync_page_tags(page, old_pte, check_swap,
pte_is_tagged);
}
}

View File

@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
$(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \

View File

@ -9,6 +9,7 @@
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/uaccess.h>
#include <clocksource/arm_arch_timer.h>
@ -973,6 +974,135 @@ static int kvm_timer_dying_cpu(unsigned int cpu)
return 0;
}
static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
if (vcpu)
irqd_set_forwarded_to_vcpu(d);
else
irqd_clr_forwarded_to_vcpu(d);
return 0;
}
static int timer_irq_set_irqchip_state(struct irq_data *d,
enum irqchip_irq_state which, bool val)
{
if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d))
return irq_chip_set_parent_state(d, which, val);
if (val)
irq_chip_mask_parent(d);
else
irq_chip_unmask_parent(d);
return 0;
}
static void timer_irq_eoi(struct irq_data *d)
{
if (!irqd_is_forwarded_to_vcpu(d))
irq_chip_eoi_parent(d);
}
static void timer_irq_ack(struct irq_data *d)
{
d = d->parent_data;
if (d->chip->irq_ack)
d->chip->irq_ack(d);
}
static struct irq_chip timer_chip = {
.name = "KVM",
.irq_ack = timer_irq_ack,
.irq_mask = irq_chip_mask_parent,
.irq_unmask = irq_chip_unmask_parent,
.irq_eoi = timer_irq_eoi,
.irq_set_type = irq_chip_set_type_parent,
.irq_set_vcpu_affinity = timer_irq_set_vcpu_affinity,
.irq_set_irqchip_state = timer_irq_set_irqchip_state,
};
static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
irq_hw_number_t hwirq = (uintptr_t)arg;
return irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
&timer_chip, NULL);
}
static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs)
{
}
static const struct irq_domain_ops timer_domain_ops = {
.alloc = timer_irq_domain_alloc,
.free = timer_irq_domain_free,
};
static struct irq_ops arch_timer_irq_ops = {
.get_input_level = kvm_arch_timer_get_input_level,
};
static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
{
*flags = irq_get_trigger_type(virq);
if (*flags != IRQF_TRIGGER_HIGH && *flags != IRQF_TRIGGER_LOW) {
kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n",
virq);
*flags = IRQF_TRIGGER_LOW;
}
}
static int kvm_irq_init(struct arch_timer_kvm_info *info)
{
struct irq_domain *domain = NULL;
if (info->virtual_irq <= 0) {
kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
info->virtual_irq);
return -ENODEV;
}
host_vtimer_irq = info->virtual_irq;
kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags);
if (kvm_vgic_global_state.no_hw_deactivation) {
struct fwnode_handle *fwnode;
struct irq_data *data;
fwnode = irq_domain_alloc_named_fwnode("kvm-timer");
if (!fwnode)
return -ENOMEM;
/* Assume both vtimer and ptimer in the same parent */
data = irq_get_irq_data(host_vtimer_irq);
domain = irq_domain_create_hierarchy(data->domain, 0,
NR_KVM_TIMERS, fwnode,
&timer_domain_ops, NULL);
if (!domain) {
irq_domain_free_fwnode(fwnode);
return -ENOMEM;
}
arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq,
(void *)TIMER_VTIMER));
}
if (info->physical_irq > 0) {
host_ptimer_irq = info->physical_irq;
kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags);
if (domain)
WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq,
(void *)TIMER_PTIMER));
}
return 0;
}
int kvm_timer_hyp_init(bool has_gic)
{
struct arch_timer_kvm_info *info;
@ -986,23 +1116,12 @@ int kvm_timer_hyp_init(bool has_gic)
return -ENODEV;
}
err = kvm_irq_init(info);
if (err)
return err;
/* First, do the virtual EL1 timer irq */
if (info->virtual_irq <= 0) {
kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
info->virtual_irq);
return -ENODEV;
}
host_vtimer_irq = info->virtual_irq;
host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
host_vtimer_irq);
host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
}
err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
"kvm guest vtimer", kvm_get_running_vcpus());
if (err) {
@ -1027,15 +1146,6 @@ int kvm_timer_hyp_init(bool has_gic)
/* Now let's do the physical EL1 timer irq */
if (info->physical_irq > 0) {
host_ptimer_irq = info->physical_irq;
host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
host_ptimer_irq);
host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
}
err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
"kvm guest ptimer", kvm_get_running_vcpus());
if (err) {
@ -1143,7 +1253,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_vtimer->host_timer_irq,
map.direct_vtimer->irq.irq,
kvm_arch_timer_get_input_level);
&arch_timer_irq_ops);
if (ret)
return ret;
@ -1151,7 +1261,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_ptimer->host_timer_irq,
map.direct_ptimer->irq.irq,
kvm_arch_timer_get_input_level);
&arch_timer_irq_ops);
}
if (ret)

View File

@ -93,6 +93,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
kvm->arch.return_nisv_io_abort_to_user = true;
break;
case KVM_CAP_ARM_MTE:
if (!system_supports_mte() || kvm->created_vcpus)
return -EINVAL;
r = 0;
kvm->arch.mte_enabled = true;
break;
default:
r = -EINVAL;
break;
@ -237,6 +243,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = 1;
break;
case KVM_CAP_ARM_MTE:
r = system_supports_mte();
break;
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
break;
@ -689,6 +698,10 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
vgic_v4_load(vcpu);
preempt_enable();
}
if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
kvm_pmu_handle_pmcr(vcpu,
__vcpu_sys_reg(vcpu, PMCR_EL0));
}
}
@ -1359,6 +1372,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
return 0;
}
case KVM_ARM_MTE_COPY_TAGS: {
struct kvm_arm_copy_mte_tags copy_tags;
if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
return -EFAULT;
return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
}
default:
return -EINVAL;
}

View File

@ -28,20 +28,40 @@
#include "trace.h"
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
VCPU_STAT("mmio_exit_user", mmio_exit_user),
VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
VCPU_STAT("exits", exits),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
{ NULL }
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
};
static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vm_stats_desc),
};
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, hvc_exit_stat),
STATS_DESC_COUNTER(VCPU, wfe_exit_stat),
STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
STATS_DESC_COUNTER(VCPU, mmio_exit_user),
STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
STATS_DESC_COUNTER(VCPU, exits)
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vcpu_stats_desc),
};
static bool core_reg_offset_is_vreg(u64 off)
@ -995,3 +1015,89 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
return ret;
}
long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
struct kvm_arm_copy_mte_tags *copy_tags)
{
gpa_t guest_ipa = copy_tags->guest_ipa;
size_t length = copy_tags->length;
void __user *tags = copy_tags->addr;
gpa_t gfn;
bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
int ret = 0;
if (!kvm_has_mte(kvm))
return -EINVAL;
if (copy_tags->reserved[0] || copy_tags->reserved[1])
return -EINVAL;
if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
return -EINVAL;
if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
return -EINVAL;
gfn = gpa_to_gfn(guest_ipa);
mutex_lock(&kvm->slots_lock);
while (length > 0) {
kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
void *maddr;
unsigned long num_tags;
struct page *page;
if (is_error_noslot_pfn(pfn)) {
ret = -EFAULT;
goto out;
}
page = pfn_to_online_page(pfn);
if (!page) {
/* Reject ZONE_DEVICE memory */
ret = -EFAULT;
goto out;
}
maddr = page_address(page);
if (!write) {
if (test_bit(PG_mte_tagged, &page->flags))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
/* No tags in memory, so write zeros */
num_tags = MTE_GRANULES_PER_PAGE -
clear_user(tags, MTE_GRANULES_PER_PAGE);
kvm_release_pfn_clean(pfn);
} else {
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
/*
* Set the flag after checking the write
* completed fully
*/
if (num_tags == MTE_GRANULES_PER_PAGE)
set_bit(PG_mte_tagged, &page->flags);
kvm_release_pfn_dirty(pfn);
}
if (num_tags != MTE_GRANULES_PER_PAGE) {
ret = -EFAULT;
goto out;
}
gfn++;
tags += num_tags;
length -= PAGE_SIZE;
}
out:
mutex_unlock(&kvm->slots_lock);
/* If some data has been copied report the number of bytes copied */
if (length != copy_tags->length)
return copy_tags->length - length;
return ret;
}

View File

@ -13,6 +13,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mte.h>
#include <asm/kvm_ptrauth.h>
.text
@ -51,6 +52,9 @@ alternative_else_nop_endif
add x29, x0, #VCPU_CONTEXT
// mte_switch_to_guest(g_ctxt, h_ctxt, tmp1)
mte_switch_to_guest x29, x1, x2
// Macro ptrauth_switch_to_guest format:
// ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3)
// The below macro to restore guest keys is not implemented in C code
@ -142,6 +146,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// when this feature is enabled for kernel code.
ptrauth_switch_to_hyp x1, x2, x3, x4, x5
// mte_switch_to_hyp(g_ctxt, h_ctxt, reg1)
mte_switch_to_hyp x1, x2, x3
// Restore hyp's sp_el0
restore_sp_el0 x2, x3

View File

@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= (old & PSR_C_BIT);
new |= (old & PSR_V_BIT);
// TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
if (kvm_has_mte(vcpu->kvm))
new |= PSR_TCO_BIT;
new |= (old & PSR_DIT_BIT);

View File

@ -76,6 +76,7 @@ el1_trap:
b __guest_exit
el1_irq:
el1_fiq:
get_vcpu_ptr x1, x0
mov x0, #ARM_EXCEPTION_IRQ
b __guest_exit
@ -131,7 +132,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_error_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
.ltorg
@ -179,12 +179,12 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_sync // Synchronous 64-bit EL1
valid_vect el1_irq // IRQ 64-bit EL1
invalid_vect el1_fiq_invalid // FIQ 64-bit EL1
valid_vect el1_fiq // FIQ 64-bit EL1
valid_vect el1_error // Error 64-bit EL1
valid_vect el1_sync // Synchronous 32-bit EL1
valid_vect el1_irq // IRQ 32-bit EL1
invalid_vect el1_fiq_invalid // FIQ 32-bit EL1
valid_vect el1_fiq // FIQ 32-bit EL1
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)

View File

@ -14,6 +14,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
@ -26,6 +27,16 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0);
}
static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
{
struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
if (!vcpu)
vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
return kvm_has_mte(kern_hyp_va(vcpu->kvm));
}
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1);
@ -46,6 +57,11 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par();
ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1);
if (ctxt_has_mte(ctxt)) {
ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
}
ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR);
@ -107,6 +123,11 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1);
write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1);
if (ctxt_has_mte(ctxt)) {
write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
}
if (!has_vhe() &&
cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
ctxt->__hyp_running_vcpu) {

View File

@ -7,7 +7,7 @@
#include <nvhe/memory.h>
#include <nvhe/spinlock.h>
#define HYP_NO_ORDER UINT_MAX
#define HYP_NO_ORDER USHRT_MAX
struct hyp_pool {
/*
@ -19,48 +19,13 @@ struct hyp_pool {
struct list_head free_area[MAX_ORDER];
phys_addr_t range_start;
phys_addr_t range_end;
unsigned int max_order;
unsigned short max_order;
};
static inline void hyp_page_ref_inc(struct hyp_page *p)
{
struct hyp_pool *pool = hyp_page_to_pool(p);
hyp_spin_lock(&pool->lock);
p->refcount++;
hyp_spin_unlock(&pool->lock);
}
static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
{
struct hyp_pool *pool = hyp_page_to_pool(p);
int ret;
hyp_spin_lock(&pool->lock);
p->refcount--;
ret = (p->refcount == 0);
hyp_spin_unlock(&pool->lock);
return ret;
}
static inline void hyp_set_page_refcounted(struct hyp_page *p)
{
struct hyp_pool *pool = hyp_page_to_pool(p);
hyp_spin_lock(&pool->lock);
if (p->refcount) {
hyp_spin_unlock(&pool->lock);
BUG();
}
p->refcount = 1;
hyp_spin_unlock(&pool->lock);
}
/* Allocation */
void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
void hyp_get_page(void *addr);
void hyp_put_page(void *addr);
void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
void hyp_get_page(struct hyp_pool *pool, void *addr);
void hyp_put_page(struct hyp_pool *pool, void *addr);
/* Used pages cannot be freed */
int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,

View File

@ -23,7 +23,7 @@ extern struct host_kvm host_kvm;
int __pkvm_prot_finalize(void);
int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
int kvm_host_prepare_stage2(void *pgt_pool_base);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
static __always_inline void __load_host_stage2(void)

View File

@ -7,12 +7,9 @@
#include <linux/types.h>
struct hyp_pool;
struct hyp_page {
unsigned int refcount;
unsigned int order;
struct hyp_pool *pool;
struct list_head node;
unsigned short refcount;
unsigned short order;
};
extern u64 __hyp_vmemmap;

View File

@ -78,19 +78,20 @@ static inline unsigned long hyp_s1_pgtable_pages(void)
return res;
}
static inline unsigned long host_s2_mem_pgtable_pages(void)
static inline unsigned long host_s2_pgtable_pages(void)
{
unsigned long res;
/*
* Include an extra 16 pages to safely upper-bound the worst case of
* concatenated pgds.
*/
return __hyp_pgtable_total_pages() + 16;
}
res = __hyp_pgtable_total_pages() + 16;
static inline unsigned long host_s2_dev_pgtable_pages(void)
{
/* Allow 1 GiB for MMIO mappings */
return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
return res;
}
#endif /* __KVM_HYP_MM_H */

View File

@ -23,8 +23,7 @@
extern unsigned long hyp_nr_cpus;
struct host_kvm host_kvm;
static struct hyp_pool host_s2_mem;
static struct hyp_pool host_s2_dev;
static struct hyp_pool host_s2_pool;
/*
* Copies of the host's CPU features registers holding sanitized values.
@ -36,7 +35,7 @@ static const u8 pkvm_hyp_id = 1;
static void *host_s2_zalloc_pages_exact(size_t size)
{
return hyp_alloc_pages(&host_s2_mem, get_order(size));
return hyp_alloc_pages(&host_s2_pool, get_order(size));
}
static void *host_s2_zalloc_page(void *pool)
@ -44,20 +43,24 @@ static void *host_s2_zalloc_page(void *pool)
return hyp_alloc_pages(pool, 0);
}
static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
static void host_s2_get_page(void *addr)
{
hyp_get_page(&host_s2_pool, addr);
}
static void host_s2_put_page(void *addr)
{
hyp_put_page(&host_s2_pool, addr);
}
static int prepare_s2_pool(void *pgt_pool_base)
{
unsigned long nr_pages, pfn;
int ret;
pfn = hyp_virt_to_pfn(mem_pgt_pool);
nr_pages = host_s2_mem_pgtable_pages();
ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
if (ret)
return ret;
pfn = hyp_virt_to_pfn(dev_pgt_pool);
nr_pages = host_s2_dev_pgtable_pages();
ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
pfn = hyp_virt_to_pfn(pgt_pool_base);
nr_pages = host_s2_pgtable_pages();
ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
if (ret)
return ret;
@ -67,8 +70,8 @@ static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
.get_page = hyp_get_page,
.put_page = hyp_put_page,
.get_page = host_s2_get_page,
.put_page = host_s2_put_page,
};
return 0;
@ -86,7 +89,7 @@ static void prepare_host_vtcr(void)
id_aa64mmfr1_el1_sys_val, phys_shift);
}
int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
int kvm_host_prepare_stage2(void *pgt_pool_base)
{
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
int ret;
@ -94,7 +97,7 @@ int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
prepare_host_vtcr();
hyp_spin_lock_init(&host_kvm.lock);
ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
ret = prepare_s2_pool(pgt_pool_base);
if (ret)
return ret;
@ -199,11 +202,10 @@ static bool range_is_memory(u64 start, u64 end)
}
static inline int __host_stage2_idmap(u64 start, u64 end,
enum kvm_pgtable_prot prot,
struct hyp_pool *pool)
enum kvm_pgtable_prot prot)
{
return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
prot, pool);
prot, &host_s2_pool);
}
static int host_stage2_idmap(u64 addr)
@ -211,7 +213,6 @@ static int host_stage2_idmap(u64 addr)
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
struct kvm_mem_range range;
bool is_memory = find_mem_range(addr, &range);
struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
int ret;
if (is_memory)
@ -222,22 +223,21 @@ static int host_stage2_idmap(u64 addr)
if (ret)
goto unlock;
ret = __host_stage2_idmap(range.start, range.end, prot, pool);
if (is_memory || ret != -ENOMEM)
ret = __host_stage2_idmap(range.start, range.end, prot);
if (ret != -ENOMEM)
goto unlock;
/*
* host_s2_mem has been provided with enough pages to cover all of
* memory with page granularity, so we should never hit the ENOMEM case.
* However, it is difficult to know how much of the MMIO range we will
* need to cover upfront, so we may need to 'recycle' the pages if we
* run out.
* The pool has been provided with enough pages to cover all of memory
* with page granularity, but it is difficult to know how much of the
* MMIO range we will need to cover upfront, so we may need to 'recycle'
* the pages if we run out.
*/
ret = host_stage2_unmap_dev_all();
if (ret)
goto unlock;
ret = __host_stage2_idmap(range.start, range.end, prot, pool);
ret = __host_stage2_idmap(range.start, range.end, prot);
unlock:
hyp_spin_unlock(&host_kvm.lock);
@ -258,7 +258,7 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
hyp_spin_lock(&host_kvm.lock);
ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
&host_s2_mem, pkvm_hyp_id);
&host_s2_pool, pkvm_hyp_id);
hyp_spin_unlock(&host_kvm.lock);
return ret != -EAGAIN ? ret : 0;

View File

@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
*/
static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
struct hyp_page *p,
unsigned int order)
unsigned short order)
{
phys_addr_t addr = hyp_page_to_phys(p);
@ -51,21 +51,49 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
/* Find a buddy page currently available for allocation */
static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
struct hyp_page *p,
unsigned int order)
unsigned short order)
{
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
if (!buddy || buddy->order != order || list_empty(&buddy->node))
if (!buddy || buddy->order != order || buddy->refcount)
return NULL;
return buddy;
}
/*
* Pages that are available for allocation are tracked in free-lists, so we use
* the pages themselves to store the list nodes to avoid wasting space. As the
* allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
* path to optimize allocation speed), we also need to clean-up the list node in
* each page when we take it out of the list.
*/
static inline void page_remove_from_list(struct hyp_page *p)
{
struct list_head *node = hyp_page_to_virt(p);
__list_del_entry(node);
memset(node, 0, sizeof(*node));
}
static inline void page_add_to_list(struct hyp_page *p, struct list_head *head)
{
struct list_head *node = hyp_page_to_virt(p);
INIT_LIST_HEAD(node);
list_add_tail(node, head);
}
static inline struct hyp_page *node_to_page(struct list_head *node)
{
return hyp_virt_to_page(node);
}
static void __hyp_attach_page(struct hyp_pool *pool,
struct hyp_page *p)
{
unsigned int order = p->order;
unsigned short order = p->order;
struct hyp_page *buddy;
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@ -83,32 +111,23 @@ static void __hyp_attach_page(struct hyp_pool *pool,
break;
/* Take the buddy out of its list, and coallesce with @p */
list_del_init(&buddy->node);
page_remove_from_list(buddy);
buddy->order = HYP_NO_ORDER;
p = min(p, buddy);
}
/* Mark the new head, and insert it */
p->order = order;
list_add_tail(&p->node, &pool->free_area[order]);
}
static void hyp_attach_page(struct hyp_page *p)
{
struct hyp_pool *pool = hyp_page_to_pool(p);
hyp_spin_lock(&pool->lock);
__hyp_attach_page(pool, p);
hyp_spin_unlock(&pool->lock);
page_add_to_list(p, &pool->free_area[order]);
}
static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
struct hyp_page *p,
unsigned int order)
unsigned short order)
{
struct hyp_page *buddy;
list_del_init(&p->node);
page_remove_from_list(p);
while (p->order > order) {
/*
* The buddy of order n - 1 currently has HYP_NO_ORDER as it
@ -119,30 +138,64 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
p->order--;
buddy = __find_buddy_nocheck(pool, p, p->order);
buddy->order = p->order;
list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
page_add_to_list(buddy, &pool->free_area[buddy->order]);
}
return p;
}
void hyp_put_page(void *addr)
static inline void hyp_page_ref_inc(struct hyp_page *p)
{
struct hyp_page *p = hyp_virt_to_page(addr);
BUG_ON(p->refcount == USHRT_MAX);
p->refcount++;
}
static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
{
p->refcount--;
return (p->refcount == 0);
}
static inline void hyp_set_page_refcounted(struct hyp_page *p)
{
BUG_ON(p->refcount);
p->refcount = 1;
}
static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
{
if (hyp_page_ref_dec_and_test(p))
hyp_attach_page(p);
__hyp_attach_page(pool, p);
}
void hyp_get_page(void *addr)
/*
* Changes to the buddy tree and page refcounts must be done with the hyp_pool
* lock held. If a refcount change requires an update to the buddy tree (e.g.
* hyp_put_page()), both operations must be done within the same critical
* section to guarantee transient states (e.g. a page with null refcount but
* not yet attached to a free list) can't be observed by well-behaved readers.
*/
void hyp_put_page(struct hyp_pool *pool, void *addr)
{
struct hyp_page *p = hyp_virt_to_page(addr);
hyp_page_ref_inc(p);
hyp_spin_lock(&pool->lock);
__hyp_put_page(pool, p);
hyp_spin_unlock(&pool->lock);
}
void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
void hyp_get_page(struct hyp_pool *pool, void *addr)
{
unsigned int i = order;
struct hyp_page *p = hyp_virt_to_page(addr);
hyp_spin_lock(&pool->lock);
hyp_page_ref_inc(p);
hyp_spin_unlock(&pool->lock);
}
void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
{
unsigned short i = order;
struct hyp_page *p;
hyp_spin_lock(&pool->lock);
@ -156,11 +209,11 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
}
/* Extract it from the tree at the right order */
p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
p = node_to_page(pool->free_area[i].next);
p = __hyp_extract_page(pool, p, order);
hyp_spin_unlock(&pool->lock);
hyp_set_page_refcounted(p);
hyp_spin_unlock(&pool->lock);
return hyp_page_to_virt(p);
}
@ -181,15 +234,14 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
/* Init the vmemmap portion */
p = hyp_phys_to_page(phys);
memset(p, 0, sizeof(*p) * nr_pages);
for (i = 0; i < nr_pages; i++) {
p[i].pool = pool;
INIT_LIST_HEAD(&p[i].node);
p[i].order = 0;
hyp_set_page_refcounted(&p[i]);
}
/* Attach the unused pages to the buddy tree */
for (i = reserved_pages; i < nr_pages; i++)
__hyp_attach_page(pool, &p[i]);
__hyp_put_page(pool, &p[i]);
return 0;
}

View File

@ -24,8 +24,7 @@ unsigned long hyp_nr_cpus;
static void *vmemmap_base;
static void *hyp_pgt_base;
static void *host_s2_mem_pgt_base;
static void *host_s2_dev_pgt_base;
static void *host_s2_pgt_base;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static int divide_memory_pool(void *virt, unsigned long size)
@ -45,14 +44,9 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!hyp_pgt_base)
return -ENOMEM;
nr_pages = host_s2_mem_pgtable_pages();
host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
if (!host_s2_mem_pgt_base)
return -ENOMEM;
nr_pages = host_s2_dev_pgtable_pages();
host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
if (!host_s2_dev_pgt_base)
nr_pages = host_s2_pgtable_pages();
host_s2_pgt_base = hyp_early_alloc_contig(nr_pages);
if (!host_s2_pgt_base)
return -ENOMEM;
return 0;
@ -144,6 +138,16 @@ static void *hyp_zalloc_hyp_page(void *arg)
return hyp_alloc_pages(&hpool, 0);
}
static void hpool_get_page(void *addr)
{
hyp_get_page(&hpool, addr);
}
static void hpool_put_page(void *addr)
{
hyp_put_page(&hpool, addr);
}
void __noreturn __pkvm_init_finalise(void)
{
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@ -159,7 +163,7 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
ret = kvm_host_prepare_stage2(host_s2_pgt_base);
if (ret)
goto out;
@ -167,8 +171,8 @@ void __noreturn __pkvm_init_finalise(void)
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.get_page = hyp_get_page,
.put_page = hyp_put_page,
.get_page = hpool_get_page,
.put_page = hpool_put_page,
};
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;

View File

@ -577,12 +577,24 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
mm_ops->put_page(ptep);
}
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}
static bool stage2_pte_executable(kvm_pte_t pte)
{
return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}
static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_block_mapping_supported(addr, end, phys, level))
@ -606,6 +618,14 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
}
/* Perform CMOs before installation of the guest stage-2 PTE */
if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
granule);
if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
smp_store_release(ptep, new);
if (stage2_pte_is_counted(new))
mm_ops->get_page(ptep);
@ -798,12 +818,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}
static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
@ -864,10 +878,11 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
}
struct stage2_attr_data {
kvm_pte_t attr_set;
kvm_pte_t attr_clr;
kvm_pte_t pte;
u32 level;
kvm_pte_t attr_set;
kvm_pte_t attr_clr;
kvm_pte_t pte;
u32 level;
struct kvm_pgtable_mm_ops *mm_ops;
};
static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@ -876,6 +891,7 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
{
kvm_pte_t pte = *ptep;
struct stage2_attr_data *data = arg;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_pte_valid(pte))
return 0;
@ -890,8 +906,17 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* but worst-case the access flag update gets lost and will be
* set on the next access instead.
*/
if (data->pte != pte)
if (data->pte != pte) {
/*
* Invalidate instruction cache before updating the guest
* stage-2 PTE if we are going to add executable permission.
*/
if (mm_ops->icache_inval_pou &&
stage2_pte_executable(pte) && !stage2_pte_executable(*ptep))
mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
kvm_granule_size(level));
WRITE_ONCE(*ptep, pte);
}
return 0;
}
@ -906,6 +931,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
struct stage2_attr_data data = {
.attr_set = attr_set & attr_mask,
.attr_clr = attr_clr & attr_mask,
.mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_attr_walker,

View File

@ -71,8 +71,7 @@ void __init kvm_hyp_reserve(void)
}
hyp_mem_pages += hyp_s1_pgtable_pages();
hyp_mem_pages += host_s2_mem_pgtable_pages();
hyp_mem_pages += host_s2_dev_pgtable_pages();
hyp_mem_pages += host_s2_pgtable_pages();
/*
* The hyp_vmemmap needs to be backed by pages, but these pages

View File

@ -126,6 +126,16 @@ static void *kvm_host_va(phys_addr_t phys)
return __va(phys);
}
static void clean_dcache_guest_page(void *va, size_t size)
{
__clean_dcache_guest_page(va, size);
}
static void invalidate_icache_guest_page(void *va, size_t size)
{
__invalidate_icache_guest_page(va, size);
}
/*
* Unmapping vs dcache management:
*
@ -432,6 +442,8 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.page_count = kvm_host_page_count,
.phys_to_virt = kvm_host_va,
.virt_to_phys = kvm_host_pa,
.dcache_clean_inval_poc = clean_dcache_guest_page,
.icache_inval_pou = invalidate_icache_guest_page,
};
/**
@ -693,16 +705,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
__clean_dcache_guest_page(pfn, size);
}
static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
__invalidate_icache_guest_page(pfn, size);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
@ -822,6 +824,74 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
return PAGE_SIZE;
}
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
unsigned long pa;
if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
return huge_page_shift(hstate_vma(vma));
if (!(vma->vm_flags & VM_PFNMAP))
return PAGE_SHIFT;
VM_BUG_ON(is_vm_hugetlb_page(vma));
pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
#ifndef __PAGETABLE_PMD_FOLDED
if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
ALIGN(hva, PUD_SIZE) <= vma->vm_end)
return PUD_SHIFT;
#endif
if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
ALIGN(hva, PMD_SIZE) <= vma->vm_end)
return PMD_SHIFT;
return PAGE_SHIFT;
}
/*
* The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
* able to see the page's tags and therefore they must be initialised first. If
* PG_mte_tagged is set, tags have already been initialised.
*
* The race in the test/set of the PG_mte_tagged flag is handled by:
* - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
* racing to santise the same page
* - mmap_lock protects between a VM faulting a page in and the VMM performing
* an mprotect() to add VM_MTE
*/
static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page;
if (!kvm_has_mte(kvm))
return 0;
/*
* pfn_to_online_page() is used to reject ZONE_DEVICE pages
* that may not support tags.
*/
page = pfn_to_online_page(pfn);
if (!page)
return -EFAULT;
for (i = 0; i < nr_pages; i++, page++) {
if (!test_bit(PG_mte_tagged, &page->flags)) {
mte_clear_page_tags(page_address(page));
set_bit(PG_mte_tagged, &page->flags);
}
}
return 0;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
@ -830,6 +900,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool write_fault, writable, force_pte = false;
bool exec_fault;
bool device = false;
bool shared;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@ -853,7 +924,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
/* Let's check if we will get back a huge page backed by hugetlbfs */
/*
* Let's check if we will get back a huge page backed by hugetlbfs, or
* get block mapping for device MMIO region.
*/
mmap_read_lock(current->mm);
vma = find_vma_intersection(current->mm, hva, hva + 1);
if (unlikely(!vma)) {
@ -862,17 +936,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
if (is_vm_hugetlb_page(vma))
vma_shift = huge_page_shift(hstate_vma(vma));
else
vma_shift = PAGE_SHIFT;
if (logging_active ||
(vma->vm_flags & VM_PFNMAP)) {
/*
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {
vma_shift = get_vma_page_shift(vma, hva);
}
shared = (vma->vm_flags & VM_PFNMAP);
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
@ -943,8 +1019,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
/*
* If the page was identified as device early by looking at
* the VMA flags, vma_pagesize is already representing the
* largest quantity we can map. If instead it was mapped
* via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
* and must not be upgraded.
*
* In both cases, we don't let transparent_hugepage_adjust()
* change things at the last minute.
*/
device = true;
force_pte = true;
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@ -965,19 +1050,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
if (vma_pagesize == PAGE_SIZE && !force_pte)
if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new VM_SHARED VMA */
if (!shared)
ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
else
ret = -EFAULT;
if (ret)
goto out_unlock;
}
if (writable)
prot |= KVM_PGTABLE_PROT_W;
if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
if (exec_fault) {
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
}
if (device)
prot |= KVM_PGTABLE_PROT_DEVICE;
@ -1168,19 +1259,22 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_pfn_t pfn = pte_pfn(range->pte);
int ret;
if (!kvm->arch.mmu.pgt)
return false;
WARN_ON(range->end - range->start != 1);
/*
* We've moved a page around, probably through CoW, so let's treat it
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
if (ret)
return false;
/*
* We've moved a page around, probably through CoW, so let's treat
* it just like a translation fault and the map handler will clean
* the cache to the PoC.
*
* The MMU notifiers will have unmapped a huge PMD before calling
* ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
* therefore we never need to clear out a huge PMD through this
@ -1346,7 +1440,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
{
hva_t hva = mem->userspace_addr;
hva_t reg_end = hva + mem->memory_size;
bool writable = !(mem->flags & KVM_MEM_READONLY);
int ret = 0;
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
@ -1363,8 +1456,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
mmap_read_lock(current->mm);
/*
* A memory region could potentially cover multiple VMAs, and any holes
* between them, so iterate over all of them to find out if we can map
* any of them right now.
* between them, so iterate over all of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
@ -1375,51 +1467,29 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
*/
do {
struct vm_area_struct *vma;
hva_t vm_start, vm_end;
vma = find_vma_intersection(current->mm, hva, reg_end);
if (!vma)
break;
/*
* Take the intersection of this VMA with the memory region
* VM_SHARED mappings are not allowed with MTE to avoid races
* when updating the PG_mte_tagged page flag, see
* sanitise_mte_tags for more details.
*/
vm_start = max(hva, vma->vm_start);
vm_end = min(reg_end, vma->vm_end);
if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
return -EINVAL;
if (vma->vm_flags & VM_PFNMAP) {
gpa_t gpa = mem->guest_phys_addr +
(vm_start - mem->userspace_addr);
phys_addr_t pa;
pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
pa += vm_start - vma->vm_start;
/* IO region dirty page logging not allowed */
if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ret = -EINVAL;
goto out;
}
ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
vm_end - vm_start,
writable);
if (ret)
break;
}
}
hva = vm_end;
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
if (change == KVM_MR_FLAGS_ONLY)
goto out;
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
mmap_read_unlock(current->mm);
return ret;
}

View File

@ -578,6 +578,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_counter_value(vcpu, i, 0);
}
@ -850,6 +851,9 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
/* One-off reload of the PMU on first run */
kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
return 0;
}

View File

@ -176,6 +176,10 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
return false;
/* MTE is incompatible with AArch32 */
if (kvm_has_mte(vcpu->kvm) && is32bit)
return false;
/* Check that the vcpus are either all 32bit or all 64bit */
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit)

View File

@ -1047,6 +1047,13 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64PFR1_EL1:
val &= ~FEATURE(ID_AA64PFR1_MTE);
if (kvm_has_mte(vcpu->kvm)) {
u64 pfr, mte;
pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
}
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
@ -1302,6 +1309,23 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
}
static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
if (kvm_has_mte(vcpu->kvm))
return 0;
return REG_HIDDEN;
}
#define MTE_REG(name) { \
SYS_DESC(SYS_##name), \
.access = undef_access, \
.reset = reset_unknown, \
.reg = name, \
.visibility = mte_visibility, \
}
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@ -1470,8 +1494,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
{ SYS_DESC(SYS_RGSR_EL1), undef_access },
{ SYS_DESC(SYS_GCR_EL1), undef_access },
MTE_REG(RGSR_EL1),
MTE_REG(GCR_EL1),
{ SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
{ SYS_DESC(SYS_TRFCR_EL1), undef_access },
@ -1498,8 +1522,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
{ SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
{ SYS_DESC(SYS_TFSR_EL1), undef_access },
{ SYS_DESC(SYS_TFSRE0_EL1), undef_access },
MTE_REG(TFSR_EL1),
MTE_REG(TFSRE0_EL1),
{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },

View File

@ -482,6 +482,16 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
return IRQ_HANDLED;
}
static struct gic_kvm_info *gic_kvm_info;
void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
{
BUG_ON(gic_kvm_info != NULL);
gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL);
if (gic_kvm_info)
*gic_kvm_info = *info;
}
/**
* kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
*
@ -509,18 +519,29 @@ void kvm_vgic_init_cpu_hardware(void)
*/
int kvm_vgic_hyp_init(void)
{
const struct gic_kvm_info *gic_kvm_info;
bool has_mask;
int ret;
gic_kvm_info = gic_get_kvm_info();
if (!gic_kvm_info)
return -ENODEV;
if (!gic_kvm_info->maint_irq) {
has_mask = !gic_kvm_info->no_maint_irq_mask;
if (has_mask && !gic_kvm_info->maint_irq) {
kvm_err("No vgic maintenance irq\n");
return -ENXIO;
}
/*
* If we get one of these oddball non-GICs, taint the kernel,
* as we have no idea of how they *really* behave.
*/
if (gic_kvm_info->no_hw_deactivation) {
kvm_info("Non-architectural vgic, tainting kernel\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
kvm_vgic_global_state.no_hw_deactivation = true;
}
switch (gic_kvm_info->type) {
case GIC_V2:
ret = vgic_v2_probe(gic_kvm_info);
@ -536,10 +557,17 @@ int kvm_vgic_hyp_init(void)
ret = -ENODEV;
}
kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
kfree(gic_kvm_info);
gic_kvm_info = NULL;
if (ret)
return ret;
kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
if (!has_mask)
return 0;
ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
vgic_maintenance_handler,
"vgic", kvm_get_running_vcpus());

View File

@ -108,11 +108,22 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
*
* Another case is when the interrupt requires a helping hand
* on deactivation (no HW deactivation, for example).
*/
if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
irq->line_level = vgic_get_phys_line_level(irq);
if (vgic_irq_is_mapped_level(irq)) {
bool resample = false;
if (!irq->line_level)
if (val & GICH_LR_PENDING_BIT) {
irq->line_level = vgic_get_phys_line_level(irq);
resample = !irq->line_level;
} else if (vgic_irq_needs_resampling(irq) &&
!(irq->active || irq->pending_latch)) {
resample = true;
}
if (resample)
vgic_irq_set_phys_active(irq, false);
}
@ -152,7 +163,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (irq->group)
val |= GICH_LR_GROUP1;
if (irq->hw) {
if (irq->hw && !vgic_irq_needs_resampling(irq)) {
val |= GICH_LR_HW;
val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
/*

View File

@ -101,11 +101,22 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
*
* Another case is when the interrupt requires a helping hand
* on deactivation (no HW deactivation, for example).
*/
if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
irq->line_level = vgic_get_phys_line_level(irq);
if (vgic_irq_is_mapped_level(irq)) {
bool resample = false;
if (!irq->line_level)
if (val & ICH_LR_PENDING_BIT) {
irq->line_level = vgic_get_phys_line_level(irq);
resample = !irq->line_level;
} else if (vgic_irq_needs_resampling(irq) &&
!(irq->active || irq->pending_latch)) {
resample = true;
}
if (resample)
vgic_irq_set_phys_active(irq, false);
}
@ -136,7 +147,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
}
}
if (irq->hw) {
if (irq->hw && !vgic_irq_needs_resampling(irq)) {
val |= ICH_LR_HW;
val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
/*

View File

@ -182,8 +182,8 @@ bool vgic_get_phys_line_level(struct vgic_irq *irq)
BUG_ON(!irq->hw);
if (irq->get_input_level)
return irq->get_input_level(irq->intid);
if (irq->ops && irq->ops->get_input_level)
return irq->ops->get_input_level(irq->intid);
WARN_ON(irq_get_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING,
@ -480,7 +480,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
/* @irq->irq_lock must be held */
static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
unsigned int host_irq,
bool (*get_input_level)(int vindid))
struct irq_ops *ops)
{
struct irq_desc *desc;
struct irq_data *data;
@ -500,7 +500,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
irq->hw = true;
irq->host_irq = host_irq;
irq->hwintid = data->hwirq;
irq->get_input_level = get_input_level;
irq->ops = ops;
return 0;
}
@ -509,11 +509,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
{
irq->hw = false;
irq->hwintid = 0;
irq->get_input_level = NULL;
irq->ops = NULL;
}
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
u32 vintid, bool (*get_input_level)(int vindid))
u32 vintid, struct irq_ops *ops)
{
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
unsigned long flags;
@ -522,7 +522,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
BUG_ON(!irq);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops);
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);

View File

@ -109,10 +109,11 @@ static inline bool kvm_is_error_hva(unsigned long addr)
}
struct kvm_vm_stat {
ulong remote_tlb_flush;
struct kvm_vm_stat_generic generic;
};
struct kvm_vcpu_stat {
struct kvm_vcpu_stat_generic generic;
u64 wait_exits;
u64 cache_exits;
u64 signal_exits;
@ -142,12 +143,6 @@ struct kvm_vcpu_stat {
#ifdef CONFIG_CPU_LOONGSON64
u64 vz_cpucfg_exits;
#endif
u64 halt_successful_poll;
u64 halt_attempted_poll;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
u64 halt_poll_invalid;
u64 halt_wakeup;
};
struct kvm_arch_memory_slot {

View File

@ -2,7 +2,7 @@
# Makefile for KVM support for MIPS
#
common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o)
common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm

View File

@ -38,43 +38,63 @@
#define VECTORSPACING 0x100 /* for EI/VI mode */
#endif
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("wait", wait_exits),
VCPU_STAT("cache", cache_exits),
VCPU_STAT("signal", signal_exits),
VCPU_STAT("interrupt", int_exits),
VCPU_STAT("cop_unusable", cop_unusable_exits),
VCPU_STAT("tlbmod", tlbmod_exits),
VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
VCPU_STAT("addrerr_st", addrerr_st_exits),
VCPU_STAT("addrerr_ld", addrerr_ld_exits),
VCPU_STAT("syscall", syscall_exits),
VCPU_STAT("resvd_inst", resvd_inst_exits),
VCPU_STAT("break_inst", break_inst_exits),
VCPU_STAT("trap_inst", trap_inst_exits),
VCPU_STAT("msa_fpe", msa_fpe_exits),
VCPU_STAT("fpe", fpe_exits),
VCPU_STAT("msa_disabled", msa_disabled_exits),
VCPU_STAT("flush_dcache", flush_dcache_exits),
VCPU_STAT("vz_gpsi", vz_gpsi_exits),
VCPU_STAT("vz_gsfc", vz_gsfc_exits),
VCPU_STAT("vz_hc", vz_hc_exits),
VCPU_STAT("vz_grr", vz_grr_exits),
VCPU_STAT("vz_gva", vz_gva_exits),
VCPU_STAT("vz_ghfc", vz_ghfc_exits),
VCPU_STAT("vz_gpa", vz_gpa_exits),
VCPU_STAT("vz_resvd", vz_resvd_exits),
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS()
};
static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vm_stats_desc),
};
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, wait_exits),
STATS_DESC_COUNTER(VCPU, cache_exits),
STATS_DESC_COUNTER(VCPU, signal_exits),
STATS_DESC_COUNTER(VCPU, int_exits),
STATS_DESC_COUNTER(VCPU, cop_unusable_exits),
STATS_DESC_COUNTER(VCPU, tlbmod_exits),
STATS_DESC_COUNTER(VCPU, tlbmiss_ld_exits),
STATS_DESC_COUNTER(VCPU, tlbmiss_st_exits),
STATS_DESC_COUNTER(VCPU, addrerr_st_exits),
STATS_DESC_COUNTER(VCPU, addrerr_ld_exits),
STATS_DESC_COUNTER(VCPU, syscall_exits),
STATS_DESC_COUNTER(VCPU, resvd_inst_exits),
STATS_DESC_COUNTER(VCPU, break_inst_exits),
STATS_DESC_COUNTER(VCPU, trap_inst_exits),
STATS_DESC_COUNTER(VCPU, msa_fpe_exits),
STATS_DESC_COUNTER(VCPU, fpe_exits),
STATS_DESC_COUNTER(VCPU, msa_disabled_exits),
STATS_DESC_COUNTER(VCPU, flush_dcache_exits),
STATS_DESC_COUNTER(VCPU, vz_gpsi_exits),
STATS_DESC_COUNTER(VCPU, vz_gsfc_exits),
STATS_DESC_COUNTER(VCPU, vz_hc_exits),
STATS_DESC_COUNTER(VCPU, vz_grr_exits),
STATS_DESC_COUNTER(VCPU, vz_gva_exits),
STATS_DESC_COUNTER(VCPU, vz_ghfc_exits),
STATS_DESC_COUNTER(VCPU, vz_gpa_exits),
STATS_DESC_COUNTER(VCPU, vz_resvd_exits),
#ifdef CONFIG_CPU_LOONGSON64
VCPU_STAT("vz_cpucfg", vz_cpucfg_exits),
STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits),
#endif
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
{NULL}
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vcpu_stats_desc),
};
bool kvm_trace_guest_mode_change;

View File

@ -120,6 +120,7 @@ extern s32 patch__call_flush_branch_caches3;
extern s32 patch__flush_count_cache_return;
extern s32 patch__flush_link_stack_return;
extern s32 patch__call_kvm_flush_link_stack;
extern s32 patch__call_kvm_flush_link_stack_p9;
extern s32 patch__memset_nocache, patch__memcpy_nocache;
extern long flush_branch_caches;
@ -140,7 +141,7 @@ void kvmhv_load_host_pmu(void);
void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,

View File

@ -19,6 +19,7 @@ struct mmu_psize_def {
int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
unsigned int tlbiel; /* tlbiel supported for that page size */
unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
unsigned long h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
union {
unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
unsigned long ap; /* Ap encoding used by PowerISA 3.0 */

View File

@ -4,6 +4,10 @@
#include <asm/hvcall.h>
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2
struct vm_area_struct;
struct mm_struct;
struct mmu_gather;

View File

@ -98,6 +98,36 @@ static inline int cpu_last_thread_sibling(int cpu)
return cpu | (threads_per_core - 1);
}
/*
* tlb_thread_siblings are siblings which share a TLB. This is not
* architected, is not something a hypervisor could emulate and a future
* CPU may change behaviour even in compat mode, so this should only be
* used on PowerNV, and only with care.
*/
static inline int cpu_first_tlb_thread_sibling(int cpu)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return cpu & ~0x6; /* Big Core */
else
return cpu_first_thread_sibling(cpu);
}
static inline int cpu_last_tlb_thread_sibling(int cpu)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return cpu | 0x6; /* Big Core */
else
return cpu_last_thread_sibling(cpu);
}
static inline int cpu_tlb_thread_sibling_step(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
return 2; /* Big Core */
else
return 1;
}
static inline u32 get_tensr(void)
{
#ifdef CONFIG_BOOKE

View File

@ -35,6 +35,19 @@
/* PACA save area size in u64 units (exgen, exmc, etc) */
#define EX_SIZE 10
/* PACA save area offsets */
#define EX_R9 0
#define EX_R10 8
#define EX_R11 16
#define EX_R12 24
#define EX_R13 32
#define EX_DAR 40
#define EX_DSISR 48
#define EX_CCR 52
#define EX_CFAR 56
#define EX_PPR 64
#define EX_CTR 72
/*
* maximum recursive depth of MCE exceptions
*/

View File

@ -413,9 +413,9 @@
#define H_RPTI_TYPE_NESTED 0x0001 /* Invalidate nested guest partition-scope */
#define H_RPTI_TYPE_TLB 0x0002 /* Invalidate TLB */
#define H_RPTI_TYPE_PWC 0x0004 /* Invalidate Page Walk Cache */
/* Invalidate Process Table Entries if H_RPTI_TYPE_NESTED is clear */
/* Invalidate caching of Process Table Entries if H_RPTI_TYPE_NESTED is clear */
#define H_RPTI_TYPE_PRT 0x0008
/* Invalidate Partition Table Entries if H_RPTI_TYPE_NESTED is set */
/* Invalidate caching of Partition Table Entries if H_RPTI_TYPE_NESTED is set */
#define H_RPTI_TYPE_PAT 0x0008
#define H_RPTI_TYPE_ALL (H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
H_RPTI_TYPE_PRT)

View File

@ -147,6 +147,7 @@
#define KVM_GUEST_MODE_SKIP 2
#define KVM_GUEST_MODE_GUEST_HV 3
#define KVM_GUEST_MODE_HOST_HV 4
#define KVM_GUEST_MODE_HV_P9 5 /* ISA >= v3.0 path */
#define KVM_INST_FETCH_FAILED -1

View File

@ -307,6 +307,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
void kvmhv_release_all_nested(struct kvm *kvm);
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
unsigned long type, unsigned long pg_sizes,
unsigned long start, unsigned long end);
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
u64 time_limit, unsigned long lpcr);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);

View File

@ -153,9 +153,17 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
return radix;
}
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr);
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
#endif
/*
* Invalid HDSISR value which is used to indicate when HW has not set the reg.
* Used to work around an errata.
*/
#define HDSISR_CANARY 0x7fff
/*
* We use a lock bit in HPTE dword 0 to synchronize updates and
* accesses to each HPTE, and another bit to indicate non-present

View File

@ -81,12 +81,13 @@ struct kvmppc_book3s_shadow_vcpu;
struct kvm_nested_guest;
struct kvm_vm_stat {
ulong remote_tlb_flush;
ulong num_2M_pages;
ulong num_1G_pages;
struct kvm_vm_stat_generic generic;
u64 num_2M_pages;
u64 num_1G_pages;
};
struct kvm_vcpu_stat {
struct kvm_vcpu_stat_generic generic;
u64 sum_exits;
u64 mmio_exits;
u64 signal_exits;
@ -102,14 +103,8 @@ struct kvm_vcpu_stat {
u64 emulated_inst_exits;
u64 dec_exits;
u64 ext_intr_exits;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
u64 halt_wait_ns;
u64 halt_successful_poll;
u64 halt_attempted_poll;
u64 halt_successful_wait;
u64 halt_poll_invalid;
u64 halt_wakeup;
u64 dbell_exits;
u64 gdbell_exits;
u64 ld;
@ -298,7 +293,6 @@ struct kvm_arch {
u8 fwnmi_enabled;
u8 secure_guest;
u8 svm_enabled;
bool threads_indep;
bool nested_enable;
bool dawr1_enabled;
pgd_t *pgtable;
@ -684,7 +678,12 @@ struct kvm_vcpu_arch {
ulong fault_dar;
u32 fault_dsisr;
unsigned long intr_msr;
ulong fault_gpa; /* guest real address of page fault (POWER9) */
/*
* POWER9 and later: fault_gpa contains the guest real address of page
* fault for a radix guest, or segment descriptor (equivalent to result
* from slbmfev of SLB entry that translated the EA) for hash guests.
*/
ulong fault_gpa;
#endif
#ifdef CONFIG_BOOKE

View File

@ -129,6 +129,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags);
extern void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu);
@ -606,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm);
extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req);
extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
@ -638,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{ return 0; }
static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{ return 0; }
#endif
#ifdef CONFIG_KVM_XIVE
@ -655,8 +659,6 @@ extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
u32 *priority);
extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
extern void kvmppc_xive_init_module(void);
extern void kvmppc_xive_exit_module(void);
extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
@ -671,6 +673,8 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status);
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
extern void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{
@ -680,8 +684,6 @@ static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
extern void kvmppc_xive_native_init_module(void);
extern void kvmppc_xive_native_exit_module(void);
extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val);
extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
@ -695,8 +697,6 @@ static inline int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
u32 *priority) { return -1; }
static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; }
static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; }
static inline void kvmppc_xive_init_module(void) { }
static inline void kvmppc_xive_exit_module(void) { }
static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
@ -711,14 +711,14 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status) { return -ENODEV; }
static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { }
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
static inline void kvmppc_xive_native_init_module(void) { }
static inline void kvmppc_xive_native_exit_module(void) { }
static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val)
{ return 0; }
@ -754,7 +754,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long tce_value, unsigned long npages);
long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
unsigned int yield_count);
long kvmppc_h_random(struct kvm_vcpu *vcpu);
long kvmppc_rm_h_random(struct kvm_vcpu *vcpu);
void kvmhv_commence_exit(int trap);
void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
void kvmppc_subcore_enter_guest(void);

View File

@ -122,12 +122,6 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
}
#endif
#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
#else
static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
#endif
extern void switch_cop(struct mm_struct *next);
extern int use_cop(unsigned long acop, struct mm_struct *mm);
extern void drop_cop(unsigned long acop, struct mm_struct *mm);
@ -222,6 +216,18 @@ static inline void mm_context_add_copro(struct mm_struct *mm) { }
static inline void mm_context_remove_copro(struct mm_struct *mm) { }
#endif
#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
unsigned long type, unsigned long pg_sizes,
unsigned long start, unsigned long end);
#else
static inline void do_h_rpt_invalidate_prt(unsigned long pid,
unsigned long lpid,
unsigned long type,
unsigned long pg_sizes,
unsigned long start,
unsigned long end) { }
#endif
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);

View File

@ -97,6 +97,18 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
extern void secondary_cpu_time_init(void);
extern void __init time_init(void);
#ifdef CONFIG_PPC64
static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;
asm volatile("lbz %0,%1(13)"
: "=r" (x)
: "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
}
#endif
DECLARE_PER_CPU(u64, decrementers_next_tb);
/* Convert timebase ticks to nanoseconds */

View File

@ -534,7 +534,6 @@ int main(void)
OFFSET(VCPU_SLB_NR, kvm_vcpu, arch.slb_nr);
OFFSET(VCPU_FAULT_DSISR, kvm_vcpu, arch.fault_dsisr);
OFFSET(VCPU_FAULT_DAR, kvm_vcpu, arch.fault_dar);
OFFSET(VCPU_FAULT_GPA, kvm_vcpu, arch.fault_gpa);
OFFSET(VCPU_INTR_MSR, kvm_vcpu, arch.intr_msr);
OFFSET(VCPU_LAST_INST, kvm_vcpu, arch.last_inst);
OFFSET(VCPU_TRAP, kvm_vcpu, arch.trap);

View File

@ -21,22 +21,6 @@
#include <asm/feature-fixups.h>
#include <asm/kup.h>
/* PACA save area offsets (exgen, exmc, etc) */
#define EX_R9 0
#define EX_R10 8
#define EX_R11 16
#define EX_R12 24
#define EX_R13 32
#define EX_DAR 40
#define EX_DSISR 48
#define EX_CCR 52
#define EX_CFAR 56
#define EX_PPR 64
#define EX_CTR 72
.if EX_SIZE != 10
.error "EX_SIZE is wrong"
.endif
/*
* Following are fixed section helper macros.
*
@ -133,7 +117,6 @@ name:
#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
#define IMASK .L_IMASK_\name\() /* IRQ soft-mask bit */
#define IKVM_SKIP .L_IKVM_SKIP_\name\() /* Generate KVM skip handler */
#define IKVM_REAL .L_IKVM_REAL_\name\() /* Real entry tests KVM */
#define __IKVM_REAL(name) .L_IKVM_REAL_ ## name
#define IKVM_VIRT .L_IKVM_VIRT_\name\() /* Virt entry tests KVM */
@ -190,9 +173,6 @@ do_define_int n
.ifndef IMASK
IMASK=0
.endif
.ifndef IKVM_SKIP
IKVM_SKIP=0
.endif
.ifndef IKVM_REAL
IKVM_REAL=0
.endif
@ -207,8 +187,6 @@ do_define_int n
.endif
.endm
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
* All interrupts which set HSRR registers, as well as SRESET and MCE and
* syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
@ -238,88 +216,28 @@ do_define_int n
/*
* If an interrupt is taken while a guest is running, it is immediately routed
* to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go first
* to kvmppc_interrupt_hv, which handles the PR guest case.
* to KVM to handle.
*/
#define kvmppc_interrupt kvmppc_interrupt_hv
#else
#define kvmppc_interrupt kvmppc_interrupt_pr
#endif
.macro KVMTEST name
.macro KVMTEST name handler
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
lbz r10,HSTATE_IN_GUEST(r13)
cmpwi r10,0
bne \name\()_kvm
.endm
.macro GEN_KVM name
.balign IFETCH_ALIGN_BYTES
\name\()_kvm:
.if IKVM_SKIP
cmpwi r10,KVM_GUEST_MODE_SKIP
beq 89f
.else
BEGIN_FTR_SECTION
ld r10,IAREA+EX_CFAR(r13)
std r10,HSTATE_CFAR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
.endif
ld r10,IAREA+EX_CTR(r13)
mtctr r10
BEGIN_FTR_SECTION
ld r10,IAREA+EX_PPR(r13)
std r10,HSTATE_PPR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r11,IAREA+EX_R11(r13)
ld r12,IAREA+EX_R12(r13)
std r12,HSTATE_SCRATCH0(r13)
sldi r12,r9,32
ld r9,IAREA+EX_R9(r13)
ld r10,IAREA+EX_R10(r13)
/* HSRR variants have the 0x2 bit added to their trap number */
.if IHSRR_IF_HVMODE
BEGIN_FTR_SECTION
ori r12,r12,(IVEC + 0x2)
li r10,(IVEC + 0x2)
FTR_SECTION_ELSE
ori r12,r12,(IVEC)
li r10,(IVEC)
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
.elseif IHSRR
ori r12,r12,(IVEC+ 0x2)
li r10,(IVEC + 0x2)
.else
ori r12,r12,(IVEC)
li r10,(IVEC)
.endif
b kvmppc_interrupt
.if IKVM_SKIP
89: mtocrf 0x80,r9
ld r10,IAREA+EX_CTR(r13)
mtctr r10
ld r9,IAREA+EX_R9(r13)
ld r10,IAREA+EX_R10(r13)
ld r11,IAREA+EX_R11(r13)
ld r12,IAREA+EX_R12(r13)
.if IHSRR_IF_HVMODE
BEGIN_FTR_SECTION
b kvmppc_skip_Hinterrupt
FTR_SECTION_ELSE
b kvmppc_skip_interrupt
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
.elseif IHSRR
b kvmppc_skip_Hinterrupt
.else
b kvmppc_skip_interrupt
.endif
.endif
.endm
#else
.macro KVMTEST name
.endm
.macro GEN_KVM name
.endm
bne \handler
#endif
.endm
/*
* This is the BOOK3S interrupt entry code macro.
@ -461,7 +379,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
DEFINE_FIXED_SYMBOL(\name\()_common_real)
\name\()_common_real:
.if IKVM_REAL
KVMTEST \name
KVMTEST \name kvm_interrupt
.endif
ld r10,PACAKMSR(r13) /* get MSR value for kernel */
@ -484,7 +402,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
DEFINE_FIXED_SYMBOL(\name\()_common_virt)
\name\()_common_virt:
.if IKVM_VIRT
KVMTEST \name
KVMTEST \name kvm_interrupt
1:
.endif
.endif /* IVIRT */
@ -498,7 +416,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt)
DEFINE_FIXED_SYMBOL(\name\()_common_real)
\name\()_common_real:
.if IKVM_REAL
KVMTEST \name
KVMTEST \name kvm_interrupt
.endif
.endm
@ -1000,8 +918,6 @@ EXC_COMMON_BEGIN(system_reset_common)
EXCEPTION_RESTORE_REGS
RFI_TO_USER_OR_KERNEL
GEN_KVM system_reset
/**
* Interrupt 0x200 - Machine Check Interrupt (MCE).
@ -1070,7 +986,6 @@ INT_DEFINE_BEGIN(machine_check)
ISET_RI=0
IDAR=1
IDSISR=1
IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(machine_check)
@ -1166,7 +1081,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
/*
* Check if we are coming from guest. If yes, then run the normal
* exception handler which will take the
* machine_check_kvm->kvmppc_interrupt branch to deliver the MC event
* machine_check_kvm->kvm_interrupt branch to deliver the MC event
* to guest.
*/
lbz r11,HSTATE_IN_GUEST(r13)
@ -1236,8 +1151,6 @@ EXC_COMMON_BEGIN(machine_check_common)
bl machine_check_exception
b interrupt_return
GEN_KVM machine_check
#ifdef CONFIG_PPC_P7_NAP
/*
@ -1342,7 +1255,6 @@ INT_DEFINE_BEGIN(data_access)
IVEC=0x300
IDAR=1
IDSISR=1
IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(data_access)
@ -1373,8 +1285,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
REST_NVGPRS(r1)
b interrupt_return
GEN_KVM data_access
/**
* Interrupt 0x380 - Data Segment Interrupt (DSLB).
@ -1396,7 +1306,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
INT_DEFINE_BEGIN(data_access_slb)
IVEC=0x380
IDAR=1
IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(data_access_slb)
@ -1425,8 +1334,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
bl do_bad_slb_fault
b interrupt_return
GEN_KVM data_access_slb
/**
* Interrupt 0x400 - Instruction Storage Interrupt (ISI).
@ -1463,8 +1370,6 @@ MMU_FTR_SECTION_ELSE
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
b interrupt_return
GEN_KVM instruction_access
/**
* Interrupt 0x480 - Instruction Segment Interrupt (ISLB).
@ -1509,8 +1414,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
bl do_bad_slb_fault
b interrupt_return
GEN_KVM instruction_access_slb
/**
* Interrupt 0x500 - External Interrupt.
@ -1555,8 +1458,6 @@ EXC_COMMON_BEGIN(hardware_interrupt_common)
bl do_IRQ
b interrupt_return
GEN_KVM hardware_interrupt
/**
* Interrupt 0x600 - Alignment Interrupt
@ -1584,8 +1485,6 @@ EXC_COMMON_BEGIN(alignment_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
GEN_KVM alignment
/**
* Interrupt 0x700 - Program Interrupt (program check).
@ -1693,8 +1592,6 @@ EXC_COMMON_BEGIN(program_check_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
GEN_KVM program_check
/*
* Interrupt 0x800 - Floating-Point Unavailable Interrupt.
@ -1744,8 +1641,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
b interrupt_return
#endif
GEN_KVM fp_unavailable
/**
* Interrupt 0x900 - Decrementer Interrupt.
@ -1784,8 +1679,6 @@ EXC_COMMON_BEGIN(decrementer_common)
bl timer_interrupt
b interrupt_return
GEN_KVM decrementer
/**
* Interrupt 0x980 - Hypervisor Decrementer Interrupt.
@ -1831,8 +1724,6 @@ EXC_COMMON_BEGIN(hdecrementer_common)
ld r13,PACA_EXGEN+EX_R13(r13)
HRFI_TO_KERNEL
GEN_KVM hdecrementer
/**
* Interrupt 0xa00 - Directed Privileged Doorbell Interrupt.
@ -1872,8 +1763,6 @@ EXC_COMMON_BEGIN(doorbell_super_common)
#endif
b interrupt_return
GEN_KVM doorbell_super
EXC_REAL_NONE(0xb00, 0x100)
EXC_VIRT_NONE(0x4b00, 0x100)
@ -1923,7 +1812,7 @@ INT_DEFINE_END(system_call)
GET_PACA(r13)
std r10,PACA_EXGEN+EX_R10(r13)
INTERRUPT_TO_KERNEL
KVMTEST system_call /* uses r10, branch to system_call_kvm */
KVMTEST system_call kvm_hcall /* uses r10, branch to kvm_hcall */
mfctr r9
#else
mr r9,r13
@ -1979,14 +1868,16 @@ EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
EXC_VIRT_END(system_call, 0x4c00, 0x100)
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
TRAMP_REAL_BEGIN(system_call_kvm)
/*
* This is a hcall, so register convention is as above, with these
* differences:
* r13 = PACA
* ctr = orig r13
* orig r10 saved in PACA
*/
TRAMP_REAL_BEGIN(kvm_hcall)
std r9,PACA_EXGEN+EX_R9(r13)
std r11,PACA_EXGEN+EX_R11(r13)
std r12,PACA_EXGEN+EX_R12(r13)
mfcr r9
mfctr r10
std r10,PACA_EXGEN+EX_R13(r13)
li r10,0
std r10,PACA_EXGEN+EX_CFAR(r13)
std r10,PACA_EXGEN+EX_CTR(r13)
/*
* Save the PPR (on systems that support it) before changing to
* HMT_MEDIUM. That allows the KVM code to save that value into the
@ -1994,31 +1885,24 @@ TRAMP_REAL_BEGIN(system_call_kvm)
*/
BEGIN_FTR_SECTION
mfspr r10,SPRN_PPR
std r10,HSTATE_PPR(r13)
std r10,PACA_EXGEN+EX_PPR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
HMT_MEDIUM
mfctr r10
SET_SCRATCH0(r10)
mfcr r10
std r12,HSTATE_SCRATCH0(r13)
sldi r12,r10,32
ori r12,r12,0xc00
#ifdef CONFIG_RELOCATABLE
/*
* Requires __LOAD_FAR_HANDLER beause kvmppc_interrupt lives
* Requires __LOAD_FAR_HANDLER beause kvmppc_hcall lives
* outside the head section.
*/
__LOAD_FAR_HANDLER(r10, kvmppc_interrupt)
__LOAD_FAR_HANDLER(r10, kvmppc_hcall)
mtctr r10
ld r10,PACA_EXGEN+EX_R10(r13)
bctr
#else
ld r10,PACA_EXGEN+EX_R10(r13)
b kvmppc_interrupt
b kvmppc_hcall
#endif
#endif
/**
* Interrupt 0xd00 - Trace Interrupt.
* This is a synchronous interrupt in response to instruction step or
@ -2043,8 +1927,6 @@ EXC_COMMON_BEGIN(single_step_common)
bl single_step_exception
b interrupt_return
GEN_KVM single_step
/**
* Interrupt 0xe00 - Hypervisor Data Storage Interrupt (HDSI).
@ -2063,7 +1945,6 @@ INT_DEFINE_BEGIN(h_data_storage)
IHSRR=1
IDAR=1
IDSISR=1
IKVM_SKIP=1
IKVM_REAL=1
IKVM_VIRT=1
INT_DEFINE_END(h_data_storage)
@ -2084,8 +1965,6 @@ MMU_FTR_SECTION_ELSE
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
b interrupt_return
GEN_KVM h_data_storage
/**
* Interrupt 0xe20 - Hypervisor Instruction Storage Interrupt (HISI).
@ -2111,8 +1990,6 @@ EXC_COMMON_BEGIN(h_instr_storage_common)
bl unknown_exception
b interrupt_return
GEN_KVM h_instr_storage
/**
* Interrupt 0xe40 - Hypervisor Emulation Assistance Interrupt.
@ -2137,8 +2014,6 @@ EXC_COMMON_BEGIN(emulation_assist_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
GEN_KVM emulation_assist
/**
* Interrupt 0xe60 - Hypervisor Maintenance Interrupt (HMI).
@ -2210,16 +2085,12 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
EXCEPTION_RESTORE_REGS hsrr=1
GEN_INT_ENTRY hmi_exception, virt=0
GEN_KVM hmi_exception_early
EXC_COMMON_BEGIN(hmi_exception_common)
GEN_COMMON hmi_exception
addi r3,r1,STACK_FRAME_OVERHEAD
bl handle_hmi_exception
b interrupt_return
GEN_KVM hmi_exception
/**
* Interrupt 0xe80 - Directed Hypervisor Doorbell Interrupt.
@ -2250,8 +2121,6 @@ EXC_COMMON_BEGIN(h_doorbell_common)
#endif
b interrupt_return
GEN_KVM h_doorbell
/**
* Interrupt 0xea0 - Hypervisor Virtualization Interrupt.
@ -2278,8 +2147,6 @@ EXC_COMMON_BEGIN(h_virt_irq_common)
bl do_IRQ
b interrupt_return
GEN_KVM h_virt_irq
EXC_REAL_NONE(0xec0, 0x20)
EXC_VIRT_NONE(0x4ec0, 0x20)
@ -2323,8 +2190,6 @@ EXC_COMMON_BEGIN(performance_monitor_common)
bl performance_monitor_exception
b interrupt_return
GEN_KVM performance_monitor
/**
* Interrupt 0xf20 - Vector Unavailable Interrupt.
@ -2374,8 +2239,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
bl altivec_unavailable_exception
b interrupt_return
GEN_KVM altivec_unavailable
/**
* Interrupt 0xf40 - VSX Unavailable Interrupt.
@ -2424,8 +2287,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
bl vsx_unavailable_exception
b interrupt_return
GEN_KVM vsx_unavailable
/**
* Interrupt 0xf60 - Facility Unavailable Interrupt.
@ -2454,8 +2315,6 @@ EXC_COMMON_BEGIN(facility_unavailable_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
GEN_KVM facility_unavailable
/**
* Interrupt 0xf60 - Hypervisor Facility Unavailable Interrupt.
@ -2484,8 +2343,6 @@ EXC_COMMON_BEGIN(h_facility_unavailable_common)
REST_NVGPRS(r1) /* XXX Shouldn't be necessary in practice */
b interrupt_return
GEN_KVM h_facility_unavailable
EXC_REAL_NONE(0xfa0, 0x20)
EXC_VIRT_NONE(0x4fa0, 0x20)
@ -2515,8 +2372,6 @@ EXC_COMMON_BEGIN(cbe_system_error_common)
bl cbe_system_error_exception
b interrupt_return
GEN_KVM cbe_system_error
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1200, 0x100)
EXC_VIRT_NONE(0x5200, 0x100)
@ -2548,8 +2403,6 @@ EXC_COMMON_BEGIN(instruction_breakpoint_common)
bl instruction_breakpoint_exception
b interrupt_return
GEN_KVM instruction_breakpoint
EXC_REAL_NONE(0x1400, 0x100)
EXC_VIRT_NONE(0x5400, 0x100)
@ -2670,8 +2523,6 @@ EXC_COMMON_BEGIN(denorm_exception_common)
bl unknown_exception
b interrupt_return
GEN_KVM denorm_exception
#ifdef CONFIG_CBE_RAS
INT_DEFINE_BEGIN(cbe_maintenance)
@ -2689,8 +2540,6 @@ EXC_COMMON_BEGIN(cbe_maintenance_common)
bl cbe_maintenance_exception
b interrupt_return
GEN_KVM cbe_maintenance
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1600, 0x100)
EXC_VIRT_NONE(0x5600, 0x100)
@ -2721,8 +2570,6 @@ EXC_COMMON_BEGIN(altivec_assist_common)
#endif
b interrupt_return
GEN_KVM altivec_assist
#ifdef CONFIG_CBE_RAS
INT_DEFINE_BEGIN(cbe_thermal)
@ -2740,8 +2587,6 @@ EXC_COMMON_BEGIN(cbe_thermal_common)
bl cbe_thermal_exception
b interrupt_return
GEN_KVM cbe_thermal
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1800, 0x100)
EXC_VIRT_NONE(0x5800, 0x100)
@ -2994,6 +2839,15 @@ TRAMP_REAL_BEGIN(rfscv_flush_fallback)
USE_TEXT_SECTION()
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
kvm_interrupt:
/*
* The conditional branch in KVMTEST can't reach all the way,
* make a stub.
*/
b kvmppc_interrupt
#endif
_GLOBAL(do_uaccess_flush)
UACCESS_FLUSH_FIXUP_SECTION
nop
@ -3009,32 +2863,6 @@ EXPORT_SYMBOL(do_uaccess_flush)
MASKED_INTERRUPT
MASKED_INTERRUPT hsrr=1
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
kvmppc_skip_interrupt:
/*
* Here all GPRs are unchanged from when the interrupt happened
* except for r13, which is saved in SPRG_SCRATCH0.
*/
mfspr r13, SPRN_SRR0
addi r13, r13, 4
mtspr SPRN_SRR0, r13
GET_SCRATCH0(r13)
RFI_TO_KERNEL
b .
kvmppc_skip_Hinterrupt:
/*
* Here all GPRs are unchanged from when the interrupt happened
* except for r13, which is saved in SPRG_SCRATCH0.
*/
mfspr r13, SPRN_HSRR0
addi r13, r13, 4
mtspr SPRN_HSRR0, r13
GET_SCRATCH0(r13)
HRFI_TO_KERNEL
b .
#endif
/*
* Relocation-on interrupts: A subset of the interrupts can be delivered
* with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering

View File

@ -432,16 +432,19 @@ device_initcall(stf_barrier_debugfs_init);
static void update_branch_cache_flush(void)
{
u32 *site;
u32 *site, __maybe_unused *site2;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
site = &patch__call_kvm_flush_link_stack;
site2 = &patch__call_kvm_flush_link_stack_p9;
// This controls the branch from guest_exit_cont to kvm_flush_link_stack
if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) {
patch_instruction_site(site, ppc_inst(PPC_INST_NOP));
patch_instruction_site(site2, ppc_inst(PPC_INST_NOP));
} else {
// Could use HW flush, but that could also flush count cache
patch_branch_site(site, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
patch_branch_site(site2, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
}
#endif

View File

@ -508,16 +508,6 @@ EXPORT_SYMBOL(profile_pc);
* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
*/
#ifdef CONFIG_PPC64
static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;
asm volatile("lbz %0,%1(13)"
: "=r" (x)
: "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
}
static inline void set_irq_work_pending_flag(void)
{
asm volatile("stb %0,%1(13)" : :

View File

@ -6,7 +6,7 @@
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
@ -57,6 +57,7 @@ kvm-pr-y := \
book3s_32_mmu.o
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_64_entry.o \
tm.o
ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@ -86,6 +87,7 @@ kvm-book3s_64-builtin-tm-objs-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_hv_hmi.o \
book3s_hv_p9_entry.o \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_hv_ras.o \

View File

@ -38,37 +38,66 @@
/* #define EXIT_DEBUG */
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("exits", sum_exits),
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
VCPU_STAT("sysc", syscall_exits),
VCPU_STAT("inst_emu", emulated_inst_exits),
VCPU_STAT("dec", dec_exits),
VCPU_STAT("ext_intr", ext_intr_exits),
VCPU_STAT("queue_intr", queue_intr),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VCPU_STAT("halt_wait_ns", halt_wait_ns),
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_successful_wait", halt_successful_wait),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("pf_storage", pf_storage),
VCPU_STAT("sp_storage", sp_storage),
VCPU_STAT("pf_instruc", pf_instruc),
VCPU_STAT("sp_instruc", sp_instruc),
VCPU_STAT("ld", ld),
VCPU_STAT("ld_slow", ld_slow),
VCPU_STAT("st", st),
VCPU_STAT("st_slow", st_slow),
VCPU_STAT("pthru_all", pthru_all),
VCPU_STAT("pthru_host", pthru_host),
VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
{ NULL }
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_ICOUNTER(VM, num_2M_pages),
STATS_DESC_ICOUNTER(VM, num_1G_pages)
};
static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vm_stats_desc),
};
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, sum_exits),
STATS_DESC_COUNTER(VCPU, mmio_exits),
STATS_DESC_COUNTER(VCPU, signal_exits),
STATS_DESC_COUNTER(VCPU, light_exits),
STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
STATS_DESC_COUNTER(VCPU, syscall_exits),
STATS_DESC_COUNTER(VCPU, isi_exits),
STATS_DESC_COUNTER(VCPU, dsi_exits),
STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
STATS_DESC_COUNTER(VCPU, dec_exits),
STATS_DESC_COUNTER(VCPU, ext_intr_exits),
STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
STATS_DESC_COUNTER(VCPU, halt_successful_wait),
STATS_DESC_COUNTER(VCPU, dbell_exits),
STATS_DESC_COUNTER(VCPU, gdbell_exits),
STATS_DESC_COUNTER(VCPU, ld),
STATS_DESC_COUNTER(VCPU, st),
STATS_DESC_COUNTER(VCPU, pf_storage),
STATS_DESC_COUNTER(VCPU, pf_instruc),
STATS_DESC_COUNTER(VCPU, sp_storage),
STATS_DESC_COUNTER(VCPU, sp_instruc),
STATS_DESC_COUNTER(VCPU, queue_intr),
STATS_DESC_COUNTER(VCPU, ld_slow),
STATS_DESC_COUNTER(VCPU, st_slow),
STATS_DESC_COUNTER(VCPU, pthru_all),
STATS_DESC_COUNTER(VCPU, pthru_host),
STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vcpu_stats_desc),
};
static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
@ -171,6 +200,12 @@ void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
}
EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu)
{
kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
}
EXPORT_SYMBOL(kvmppc_core_queue_syscall);
void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
/* might as well deliver this straight away */
@ -1044,13 +1079,10 @@ static int kvmppc_book3s_init(void)
#ifdef CONFIG_KVM_XICS
#ifdef CONFIG_KVM_XIVE
if (xics_on_xive()) {
kvmppc_xive_init_module();
kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
if (kvmppc_xive_native_supported()) {
kvmppc_xive_native_init_module();
if (kvmppc_xive_native_supported())
kvm_register_device_ops(&kvm_xive_native_ops,
KVM_DEV_TYPE_XIVE);
}
} else
#endif
kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
@ -1060,12 +1092,6 @@ static int kvmppc_book3s_init(void)
static void kvmppc_book3s_exit(void)
{
#ifdef CONFIG_KVM_XICS
if (xics_on_xive()) {
kvmppc_xive_exit_module();
kvmppc_xive_native_exit_module();
}
#endif
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
kvmppc_book3s_exit_pr();
#endif

View File

@ -0,0 +1,416 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <asm/asm-offsets.h>
#include <asm/cache.h>
#include <asm/code-patching-asm.h>
#include <asm/exception-64s.h>
#include <asm/export.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/mmu.h>
#include <asm/ppc_asm.h>
#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/ultravisor-api.h>
/*
* These are branched to from interrupt handlers in exception-64s.S which set
* IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
*/
/*
* This is a hcall, so register convention is as
* Documentation/powerpc/papr_hcalls.rst.
*
* This may also be a syscall from PR-KVM userspace that is to be
* reflected to the PR guest kernel, so registers may be set up for
* a system call rather than hcall. We don't currently clobber
* anything here, but the 0xc00 handler has already clobbered CTR
* and CR0, so PR-KVM can not support a guest kernel that preserves
* those registers across its system calls.
*
* The state of registers is as kvmppc_interrupt, except CFAR is not
* saved, R13 is not in SCRATCH0, and R10 does not contain the trap.
*/
.global kvmppc_hcall
.balign IFETCH_ALIGN_BYTES
kvmppc_hcall:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
lbz r10,HSTATE_IN_GUEST(r13)
cmpwi r10,KVM_GUEST_MODE_HV_P9
beq kvmppc_p9_exit_hcall
#endif
ld r10,PACA_EXGEN+EX_R13(r13)
SET_SCRATCH0(r10)
li r10,0xc00
/* Now we look like kvmppc_interrupt */
li r11,PACA_EXGEN
b .Lgot_save_area
/*
* KVM interrupt entry occurs after GEN_INT_ENTRY runs, and follows that
* call convention:
*
* guest R9-R13, CTR, CFAR, PPR saved in PACA EX_xxx save area
* guest (H)DAR, (H)DSISR are also in the save area for relevant interrupts
* guest R13 also saved in SCRATCH0
* R13 = PACA
* R11 = (H)SRR0
* R12 = (H)SRR1
* R9 = guest CR
* PPR is set to medium
*
* With the addition for KVM:
* R10 = trap vector
*/
.global kvmppc_interrupt
.balign IFETCH_ALIGN_BYTES
kvmppc_interrupt:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
std r10,HSTATE_SCRATCH0(r13)
lbz r10,HSTATE_IN_GUEST(r13)
cmpwi r10,KVM_GUEST_MODE_HV_P9
beq kvmppc_p9_exit_interrupt
ld r10,HSTATE_SCRATCH0(r13)
#endif
li r11,PACA_EXGEN
cmpdi r10,0x200
bgt+ .Lgot_save_area
li r11,PACA_EXMC
beq .Lgot_save_area
li r11,PACA_EXNMI
.Lgot_save_area:
add r11,r11,r13
BEGIN_FTR_SECTION
ld r12,EX_CFAR(r11)
std r12,HSTATE_CFAR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
ld r12,EX_CTR(r11)
mtctr r12
BEGIN_FTR_SECTION
ld r12,EX_PPR(r11)
std r12,HSTATE_PPR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r12,EX_R12(r11)
std r12,HSTATE_SCRATCH0(r13)
sldi r12,r9,32
or r12,r12,r10
ld r9,EX_R9(r11)
ld r10,EX_R10(r11)
ld r11,EX_R11(r11)
/*
* Hcalls and other interrupts come here after normalising register
* contents and save locations:
*
* R12 = (guest CR << 32) | interrupt vector
* R13 = PACA
* guest R12 saved in shadow HSTATE_SCRATCH0
* guest R13 saved in SPRN_SCRATCH0
*/
std r9,HSTATE_SCRATCH2(r13)
lbz r9,HSTATE_IN_GUEST(r13)
cmpwi r9,KVM_GUEST_MODE_SKIP
beq- .Lmaybe_skip
.Lno_skip:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
cmpwi r9,KVM_GUEST_MODE_GUEST
beq kvmppc_interrupt_pr
#endif
b kvmppc_interrupt_hv
#else
b kvmppc_interrupt_pr
#endif
/*
* "Skip" interrupts are part of a trick KVM uses a with hash guests to load
* the faulting instruction in guest memory from the the hypervisor without
* walking page tables.
*
* When the guest takes a fault that requires the hypervisor to load the
* instruction (e.g., MMIO emulation), KVM is running in real-mode with HV=1
* and the guest MMU context loaded. It sets KVM_GUEST_MODE_SKIP, and sets
* MSR[DR]=1 while leaving MSR[IR]=0, so it continues to fetch HV instructions
* but loads and stores will access the guest context. This is used to load
* the faulting instruction using the faulting guest effective address.
*
* However the guest context may not be able to translate, or it may cause a
* machine check or other issue, which results in a fault in the host
* (even with KVM-HV).
*
* These faults come here because KVM_GUEST_MODE_SKIP was set, so if they
* are (or are likely) caused by that load, the instruction is skipped by
* just returning with the PC advanced +4, where it is noticed the load did
* not execute and it goes to the slow path which walks the page tables to
* read guest memory.
*/
.Lmaybe_skip:
cmpwi r12,BOOK3S_INTERRUPT_MACHINE_CHECK
beq 1f
cmpwi r12,BOOK3S_INTERRUPT_DATA_STORAGE
beq 1f
cmpwi r12,BOOK3S_INTERRUPT_DATA_SEGMENT
beq 1f
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/* HSRR interrupts get 2 added to interrupt number */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | 0x2
beq 2f
#endif
b .Lno_skip
1: mfspr r9,SPRN_SRR0
addi r9,r9,4
mtspr SPRN_SRR0,r9
ld r12,HSTATE_SCRATCH0(r13)
ld r9,HSTATE_SCRATCH2(r13)
GET_SCRATCH0(r13)
RFI_TO_KERNEL
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
2: mfspr r9,SPRN_HSRR0
addi r9,r9,4
mtspr SPRN_HSRR0,r9
ld r12,HSTATE_SCRATCH0(r13)
ld r9,HSTATE_SCRATCH2(r13)
GET_SCRATCH0(r13)
HRFI_TO_KERNEL
#endif
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/* Stack frame offsets for kvmppc_p9_enter_guest */
#define SFS (144 + STACK_FRAME_MIN_SIZE)
#define STACK_SLOT_NVGPRS (SFS - 144) /* 18 gprs */
/*
* void kvmppc_p9_enter_guest(struct vcpu *vcpu);
*
* Enter the guest on a ISAv3.0 or later system.
*/
.balign IFETCH_ALIGN_BYTES
_GLOBAL(kvmppc_p9_enter_guest)
EXPORT_SYMBOL_GPL(kvmppc_p9_enter_guest)
mflr r0
std r0,PPC_LR_STKOFF(r1)
stdu r1,-SFS(r1)
std r1,HSTATE_HOST_R1(r13)
mfcr r4
stw r4,SFS+8(r1)
reg = 14
.rept 18
std reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
ld r4,VCPU_LR(r3)
mtlr r4
ld r4,VCPU_CTR(r3)
mtctr r4
ld r4,VCPU_XER(r3)
mtspr SPRN_XER,r4
ld r1,VCPU_CR(r3)
BEGIN_FTR_SECTION
ld r4,VCPU_CFAR(r3)
mtspr SPRN_CFAR,r4
END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
BEGIN_FTR_SECTION
ld r4,VCPU_PPR(r3)
mtspr SPRN_PPR,r4
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
reg = 4
.rept 28
ld reg,__VCPU_GPR(reg)(r3)
reg = reg + 1
.endr
ld r4,VCPU_KVM(r3)
lbz r4,KVM_SECURE_GUEST(r4)
cmpdi r4,0
ld r4,VCPU_GPR(R4)(r3)
bne .Lret_to_ultra
mtcr r1
ld r0,VCPU_GPR(R0)(r3)
ld r1,VCPU_GPR(R1)(r3)
ld r2,VCPU_GPR(R2)(r3)
ld r3,VCPU_GPR(R3)(r3)
HRFI_TO_GUEST
b .
/*
* Use UV_RETURN ultracall to return control back to the Ultravisor
* after processing an hypercall or interrupt that was forwarded
* (a.k.a. reflected) to the Hypervisor.
*
* All registers have already been reloaded except the ucall requires:
* R0 = hcall result
* R2 = SRR1, so UV can detect a synthesized interrupt (if any)
* R3 = UV_RETURN
*/
.Lret_to_ultra:
mtcr r1
ld r1,VCPU_GPR(R1)(r3)
ld r0,VCPU_GPR(R3)(r3)
mfspr r2,SPRN_SRR1
LOAD_REG_IMMEDIATE(r3, UV_RETURN)
sc 2
/*
* kvmppc_p9_exit_hcall and kvmppc_p9_exit_interrupt are branched to from
* above if the interrupt was taken for a guest that was entered via
* kvmppc_p9_enter_guest().
*
* The exit code recovers the host stack and vcpu pointer, saves all guest GPRs
* and CR, LR, XER as well as guest MSR and NIA into the VCPU, then re-
* establishes the host stack and registers to return from the
* kvmppc_p9_enter_guest() function, which saves CTR and other guest registers
* (SPRs and FP, VEC, etc).
*/
.balign IFETCH_ALIGN_BYTES
kvmppc_p9_exit_hcall:
mfspr r11,SPRN_SRR0
mfspr r12,SPRN_SRR1
li r10,0xc00
std r10,HSTATE_SCRATCH0(r13)
.balign IFETCH_ALIGN_BYTES
kvmppc_p9_exit_interrupt:
/*
* If set to KVM_GUEST_MODE_HV_P9 but we're still in the
* hypervisor, that means we can't return from the entry stack.
*/
rldicl. r10,r12,64-MSR_HV_LG,63
bne- kvmppc_p9_bad_interrupt
std r1,HSTATE_SCRATCH1(r13)
std r3,HSTATE_SCRATCH2(r13)
ld r1,HSTATE_HOST_R1(r13)
ld r3,HSTATE_KVM_VCPU(r13)
std r9,VCPU_CR(r3)
1:
std r11,VCPU_PC(r3)
std r12,VCPU_MSR(r3)
reg = 14
.rept 18
std reg,__VCPU_GPR(reg)(r3)
reg = reg + 1
.endr
/* r1, r3, r9-r13 are saved to vcpu by C code */
std r0,VCPU_GPR(R0)(r3)
std r2,VCPU_GPR(R2)(r3)
reg = 4
.rept 5
std reg,__VCPU_GPR(reg)(r3)
reg = reg + 1
.endr
ld r2,PACATOC(r13)
mflr r4
std r4,VCPU_LR(r3)
mfspr r4,SPRN_XER
std r4,VCPU_XER(r3)
reg = 14
.rept 18
ld reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
lwz r4,SFS+8(r1)
mtcr r4
/*
* Flush the link stack here, before executing the first blr on the
* way out of the guest.
*
* The link stack won't match coming out of the guest anyway so the
* only cost is the flush itself. The call clobbers r0.
*/
1: nop
patch_site 1b patch__call_kvm_flush_link_stack_p9
addi r1,r1,SFS
ld r0,PPC_LR_STKOFF(r1)
mtlr r0
blr
/*
* Took an interrupt somewhere right before HRFID to guest, so registers are
* in a bad way. Return things hopefully enough to run host virtual code and
* run the Linux interrupt handler (SRESET or MCE) to print something useful.
*
* We could be really clever and save all host registers in known locations
* before setting HSTATE_IN_GUEST, then restoring them all here, and setting
* return address to a fixup that sets them up again. But that's a lot of
* effort for a small bit of code. Lots of other things to do first.
*/
kvmppc_p9_bad_interrupt:
BEGIN_MMU_FTR_SECTION
/*
* Hash host doesn't try to recover MMU (requires host SLB reload)
*/
b .
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
/*
* Clean up guest registers to give host a chance to run.
*/
li r10,0
mtspr SPRN_AMR,r10
mtspr SPRN_IAMR,r10
mtspr SPRN_CIABR,r10
mtspr SPRN_DAWRX0,r10
BEGIN_FTR_SECTION
mtspr SPRN_DAWRX1,r10
END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
mtspr SPRN_PID,r10
/*
* Switch to host MMU mode
*/
ld r10, HSTATE_KVM_VCPU(r13)
ld r10, VCPU_KVM(r10)
lwz r10, KVM_HOST_LPID(r10)
mtspr SPRN_LPID,r10
ld r10, HSTATE_KVM_VCPU(r13)
ld r10, VCPU_KVM(r10)
ld r10, KVM_HOST_LPCR(r10)
mtspr SPRN_LPCR,r10
/*
* Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
* MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
*/
li r10,KVM_GUEST_MODE_NONE
stb r10,HSTATE_IN_GUEST(r13)
li r10,MSR_RI
andc r12,r12,r10
/*
* Go back to interrupt handler. MCE and SRESET have their specific
* PACA save area so they should be used directly. They set up their
* own stack. The other handlers all use EXGEN. They will use the
* guest r1 if it looks like a kernel stack, so just load the
* emergency stack and go to program check for all other interrupts.
*/
ld r10,HSTATE_SCRATCH0(r13)
cmpwi r10,BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_common
cmpwi r10,BOOK3S_INTERRUPT_SYSTEM_RESET
beq system_reset_common
b .
#endif

View File

@ -21,6 +21,7 @@
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>
/*
* Supported radix tree geometry.
@ -318,9 +319,19 @@ void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
}
psi = shift_to_mmu_psize(pshift);
rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
lpid, rb);
if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
lpid, rb);
} else {
rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
H_RPTI_TYPE_NESTED |
H_RPTI_TYPE_TLB,
psize_to_rpti_pgsize(psi),
addr, addr + psize);
}
if (rc)
pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
@ -334,8 +345,14 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
return;
}
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
else
rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
H_RPTI_TYPE_NESTED |
H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
0, -1UL);
if (rc)
pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

View File

@ -391,10 +391,6 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
/* liobn, ioba, tce); */
/* For radix, we might be in virtual mode, so punt */
if (kvm_is_radix(vcpu->kvm))
return H_TOO_HARD;
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;
@ -489,10 +485,6 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
bool prereg = false;
struct kvmppc_spapr_tce_iommu_table *stit;
/* For radix, we might be in virtual mode, so punt */
if (kvm_is_radix(vcpu->kvm))
return H_TOO_HARD;
/*
* used to check for invalidations in progress
*/
@ -602,10 +594,6 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
long i, ret;
struct kvmppc_spapr_tce_iommu_table *stit;
/* For radix, we might be in virtual mode, so punt */
if (kvm_is_radix(vcpu->kvm))
return H_TOO_HARD;
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;

File diff suppressed because it is too large Load Diff

View File

@ -34,21 +34,6 @@
#include "book3s_xics.h"
#include "book3s_xive.h"
/*
* The XIVE module will populate these when it loads
*/
unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr);
int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
EXPORT_SYMBOL_GPL(__xive_vm_h_xirr);
EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll);
EXPORT_SYMBOL_GPL(__xive_vm_h_ipi);
EXPORT_SYMBOL_GPL(__xive_vm_h_cppr);
EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
/*
* Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
* should be power of 2.
@ -196,16 +181,9 @@ int kvmppc_hwrng_present(void)
}
EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
long kvmppc_h_random(struct kvm_vcpu *vcpu)
long kvmppc_rm_h_random(struct kvm_vcpu *vcpu)
{
int r;
/* Only need to do the expensive mfmsr() on radix */
if (kvm_is_radix(vcpu->kvm) && (mfmsr() & MSR_IR))
r = powernv_get_random_long(&vcpu->arch.regs.gpr[4]);
else
r = powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4]);
if (r)
if (powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4]))
return H_SUCCESS;
return H_HARDWARE;
@ -221,15 +199,6 @@ void kvmhv_rm_send_ipi(int cpu)
void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* For a nested hypervisor, use the XICS via hcall */
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
IPI_PRIORITY);
return;
}
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
@ -442,19 +411,12 @@ static long kvmppc_read_one_intr(bool *again)
return 1;
/* Now read the interrupt from the ICP */
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
xirr = cpu_to_be32(retbuf[0]);
} else {
xics_phys = local_paca->kvm_hstate.xics_phys;
rc = 0;
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
}
xics_phys = local_paca->kvm_hstate.xics_phys;
rc = 0;
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
if (rc < 0)
return 1;
@ -483,13 +445,7 @@ static long kvmppc_read_one_intr(bool *again)
*/
if (xisr == XICS_IPI) {
rc = 0;
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(), 0xff);
plpar_hcall_raw(H_EOI, retbuf, h_xirr);
} else if (xics_phys) {
if (xics_phys) {
__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
@ -515,13 +471,7 @@ static long kvmppc_read_one_intr(bool *again)
/* We raced with the host,
* we need to resend that IPI, bummer
*/
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(),
IPI_PRIORITY);
} else if (xics_phys)
if (xics_phys)
__raw_rm_writeb(IPI_PRIORITY,
xics_phys + XICS_MFRR);
else
@ -541,22 +491,13 @@ static long kvmppc_read_one_intr(bool *again)
}
#ifdef CONFIG_KVM_XICS
static inline bool is_rm(void)
{
return !(mfmsr() & MSR_DR);
}
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
if (unlikely(!__xive_vm_h_xirr))
return H_NOT_AVAILABLE;
return __xive_vm_h_xirr(vcpu);
} else
if (xics_on_xive())
return xive_rm_h_xirr(vcpu);
else
return xics_rm_h_xirr(vcpu);
}
@ -565,13 +506,9 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
vcpu->arch.regs.gpr[5] = get_tb();
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
if (unlikely(!__xive_vm_h_xirr))
return H_NOT_AVAILABLE;
return __xive_vm_h_xirr(vcpu);
} else
if (xics_on_xive())
return xive_rm_h_xirr(vcpu);
else
return xics_rm_h_xirr(vcpu);
}
@ -579,13 +516,9 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_ipoll(vcpu, server);
if (unlikely(!__xive_vm_h_ipoll))
return H_NOT_AVAILABLE;
return __xive_vm_h_ipoll(vcpu, server);
} else
if (xics_on_xive())
return xive_rm_h_ipoll(vcpu, server);
else
return H_TOO_HARD;
}
@ -594,13 +527,9 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_ipi(vcpu, server, mfrr);
if (unlikely(!__xive_vm_h_ipi))
return H_NOT_AVAILABLE;
return __xive_vm_h_ipi(vcpu, server, mfrr);
} else
if (xics_on_xive())
return xive_rm_h_ipi(vcpu, server, mfrr);
else
return xics_rm_h_ipi(vcpu, server, mfrr);
}
@ -608,13 +537,9 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_cppr(vcpu, cppr);
if (unlikely(!__xive_vm_h_cppr))
return H_NOT_AVAILABLE;
return __xive_vm_h_cppr(vcpu, cppr);
} else
if (xics_on_xive())
return xive_rm_h_cppr(vcpu, cppr);
else
return xics_rm_h_cppr(vcpu, cppr);
}
@ -622,13 +547,9 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_eoi(vcpu, xirr);
if (unlikely(!__xive_vm_h_eoi))
return H_NOT_AVAILABLE;
return __xive_vm_h_eoi(vcpu, xirr);
} else
if (xics_on_xive())
return xive_rm_h_eoi(vcpu, xirr);
else
return xics_rm_h_eoi(vcpu, xirr);
}
#endif /* CONFIG_KVM_XICS */
@ -800,7 +721,7 @@ void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
* Thus we make all 4 threads use the same bit.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
pcpu = cpu_first_thread_sibling(pcpu);
pcpu = cpu_first_tlb_thread_sibling(pcpu);
if (nested)
need_tlb_flush = &nested->need_tlb_flush;

View File

@ -58,7 +58,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/*
* Put whatever is in the decrementer into the
* hypervisor decrementer.
* Because of a hardware deviation in P8 and P9,
* Because of a hardware deviation in P8,
* we need to set LPCR[HDICE] before writing HDEC.
*/
ld r5, HSTATE_KVM_VCORE(r13)
@ -67,15 +67,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
ori r8, r9, LPCR_HDICE
mtspr SPRN_LPCR, r8
isync
andis. r0, r9, LPCR_LD@h
mfspr r8,SPRN_DEC
mftb r7
BEGIN_FTR_SECTION
/* On POWER9, don't sign-extend if host LPCR[LD] bit is set */
bne 32f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r8,r8
32: mtspr SPRN_HDEC,r8
mtspr SPRN_HDEC,r8
add r8,r8,r7
std r8,HSTATE_DECEXP(r13)

View File

@ -19,6 +19,7 @@
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/reg.h>
#include <asm/plpar_wrappers.h>
static struct patb_entry *pseries_partition_tb;
@ -53,7 +54,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
hr->dawrx1 = vcpu->arch.dawrx1;
}
static void byteswap_pt_regs(struct pt_regs *regs)
/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */
static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
{
unsigned long *addr = (unsigned long *) regs;
@ -467,8 +469,15 @@ static void kvmhv_flush_lpid(unsigned int lpid)
return;
}
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
else
rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
H_RPTI_TYPE_NESTED |
H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
H_RPTI_TYPE_PAT,
H_RPTI_PAGE_ALL, 0, -1UL);
if (rc)
pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}
@ -1214,6 +1223,113 @@ long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
return H_SUCCESS;
}
static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
unsigned long lpid, unsigned long ric)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
gp = kvmhv_get_nested(kvm, lpid, false);
if (gp) {
kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
kvmhv_put_nested(gp);
}
return H_SUCCESS;
}
/*
* Number of pages above which we invalidate the entire LPID rather than
* flush individual pages.
*/
static unsigned long tlb_range_flush_page_ceiling __read_mostly = 33;
static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
unsigned long lpid,
unsigned long pg_sizes,
unsigned long start,
unsigned long end)
{
int ret = H_P4;
unsigned long addr, nr_pages;
struct mmu_psize_def *def;
unsigned long psize, ap, page_size;
bool flush_lpid;
for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
def = &mmu_psize_defs[psize];
if (!(pg_sizes & def->h_rpt_pgsize))
continue;
nr_pages = (end - start) >> def->shift;
flush_lpid = nr_pages > tlb_range_flush_page_ceiling;
if (flush_lpid)
return do_tlb_invalidate_nested_all(vcpu, lpid,
RIC_FLUSH_TLB);
addr = start;
ap = mmu_get_ap(psize);
page_size = 1UL << def->shift;
do {
ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
get_epn(addr));
if (ret)
return H_P4;
addr += page_size;
} while (addr < end);
}
return ret;
}
/*
* Performs partition-scoped invalidations for nested guests
* as part of H_RPT_INVALIDATE hcall.
*/
long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
unsigned long type, unsigned long pg_sizes,
unsigned long start, unsigned long end)
{
/*
* If L2 lpid isn't valid, we need to return H_PARAMETER.
*
* However, nested KVM issues a L2 lpid flush call when creating
* partition table entries for L2. This happens even before the
* corresponding shadow lpid is created in HV which happens in
* H_ENTER_NESTED call. Since we can't differentiate this case from
* the invalid case, we ignore such flush requests and return success.
*/
if (!kvmhv_find_nested(vcpu->kvm, lpid))
return H_SUCCESS;
/*
* A flush all request can be handled by a full lpid flush only.
*/
if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_ALL);
/*
* We don't need to handle a PWC flush like process table here,
* because intermediate partition scoped table in nested guest doesn't
* really have PWC. Only level we have PWC is in L0 and for nested
* invalidate at L0 we always do kvm_flush_lpid() which does
* radix__flush_all_lpid(). For range invalidate at any level, we
* are not removing the higher level page tables and hence there is
* no PWC invalidate needed.
*
* if (type & H_RPTI_TYPE_PWC) {
* ret = do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_PWC);
* if (ret)
* return H_P4;
* }
*/
if (start == 0 && end == -1)
return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_TLB);
if (type & H_RPTI_TYPE_TLB)
return do_tlb_invalidate_nested_tlb(vcpu, lpid, pg_sizes,
start, end);
return H_SUCCESS;
}
/* Used to convert a nested guest real address to a L1 guest real address */
static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,

View File

@ -0,0 +1,508 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <asm/asm-prototypes.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 tb = mftb() - vc->tb_offset_applied;
vcpu->arch.cur_activity = next;
vcpu->arch.cur_tb_start = tb;
}
static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
struct kvmhv_tb_accumulator *curr;
u64 tb = mftb() - vc->tb_offset_applied;
u64 prev_tb;
u64 delta;
u64 seq;
curr = vcpu->arch.cur_activity;
vcpu->arch.cur_activity = next;
prev_tb = vcpu->arch.cur_tb_start;
vcpu->arch.cur_tb_start = tb;
if (!curr)
return;
delta = tb - prev_tb;
seq = curr->seqcount;
curr->seqcount = seq + 1;
smp_wmb();
curr->tb_total += delta;
if (seq == 0 || delta < curr->tb_min)
curr->tb_min = delta;
if (delta > curr->tb_max)
curr->tb_max = delta;
smp_wmb();
curr->seqcount = seq + 2;
}
#define start_timing(vcpu, next) __start_timing(vcpu, next)
#define end_timing(vcpu) __start_timing(vcpu, NULL)
#define accumulate_time(vcpu, next) __accumulate_time(vcpu, next)
#else
#define start_timing(vcpu, next) do {} while (0)
#define end_timing(vcpu) do {} while (0)
#define accumulate_time(vcpu, next) do {} while (0)
#endif
static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
{
asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
}
static inline void mtslb(u64 slbee, u64 slbev)
{
asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
}
static inline void clear_slb_entry(unsigned int idx)
{
mtslb(idx, 0);
}
static inline void slb_clear_invalidate_partition(void)
{
clear_slb_entry(0);
asm volatile(PPC_SLBIA(6));
}
/*
* Malicious or buggy radix guests may have inserted SLB entries
* (only 0..3 because radix always runs with UPRT=1), so these must
* be cleared here to avoid side-channels. slbmte is used rather
* than slbia, as it won't clear cached translations.
*/
static void radix_clear_slb(void)
{
int i;
for (i = 0; i < 4; i++)
clear_slb_entry(i);
}
static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
u32 lpid;
lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
/*
* All the isync()s are overkill but trivially follow the ISA
* requirements. Some can likely be replaced with justification
* comment for why they are not needed.
*/
isync();
mtspr(SPRN_LPID, lpid);
isync();
mtspr(SPRN_LPCR, lpcr);
isync();
mtspr(SPRN_PID, vcpu->arch.pid);
isync();
}
static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
{
u32 lpid;
int i;
lpid = kvm->arch.lpid;
mtspr(SPRN_LPID, lpid);
mtspr(SPRN_LPCR, lpcr);
mtspr(SPRN_PID, vcpu->arch.pid);
for (i = 0; i < vcpu->arch.slb_max; i++)
mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
isync();
}
static void switch_mmu_to_host(struct kvm *kvm, u32 pid)
{
isync();
mtspr(SPRN_PID, pid);
isync();
mtspr(SPRN_LPID, kvm->arch.host_lpid);
isync();
mtspr(SPRN_LPCR, kvm->arch.host_lpcr);
isync();
if (!radix_enabled())
slb_restore_bolted_realmode();
}
static void save_clear_host_mmu(struct kvm *kvm)
{
if (!radix_enabled()) {
/*
* Hash host could save and restore host SLB entries to
* reduce SLB fault overheads of VM exits, but for now the
* existing code clears all entries and restores just the
* bolted ones when switching back to host.
*/
slb_clear_invalidate_partition();
}
}
static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (kvm_is_radix(kvm)) {
radix_clear_slb();
} else {
int i;
int nr = 0;
/*
* This must run before switching to host (radix host can't
* access all SLBs).
*/
for (i = 0; i < vcpu->arch.slb_nr; i++) {
u64 slbee, slbev;
mfslb(i, &slbee, &slbev);
if (slbee & SLB_ESID_V) {
vcpu->arch.slb[nr].orige = slbee | i;
vcpu->arch.slb[nr].origv = slbev;
nr++;
}
}
vcpu->arch.slb_max = nr;
slb_clear_invalidate_partition();
}
}
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
s64 hdec;
u64 tb, purr, spurr;
u64 *exsave;
bool ri_set;
int trap;
unsigned long msr;
unsigned long host_hfscr;
unsigned long host_ciabr;
unsigned long host_dawr0;
unsigned long host_dawrx0;
unsigned long host_psscr;
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
hdec = time_limit - mftb();
if (hdec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
start_timing(vcpu, &vcpu->arch.rm_entry);
vcpu->arch.ceded = 0;
if (vc->tb_offset) {
u64 new_tb = mftb() + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = vc->tb_offset;
}
msr = mfmsr();
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
host_dawr0 = mfspr(SPRN_DAWR0);
host_dawrx0 = mfspr(SPRN_DAWRX0);
host_psscr = mfspr(SPRN_PSSCR);
host_pidr = mfspr(SPRN_PID);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
host_dawr1 = mfspr(SPRN_DAWR1);
host_dawrx1 = mfspr(SPRN_DAWRX1);
}
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
if (dawr_enabled()) {
mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
}
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
/*
* On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
* Interrupt (HDSI) the HDSISR is not be updated at all.
*
* To work around this we put a canary value into the HDSISR before
* returning to a guest and then check for this canary when we take a
* HDSI. If we find the canary on a HDSI, we know the hardware didn't
* update the HDSISR. In this case we return to the guest to retake the
* HDSI which should correctly update the HDSISR the second time HDSI
* entry.
*
* Just do this on all p9 processors for now.
*/
mtspr(SPRN_HDSISR, HDSISR_CANARY);
mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
mtspr(SPRN_AMOR, ~0UL);
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
/*
* Hash host, hash guest, or radix guest with prefetch bug, all have
* to disable the MMU before switching to guest MMU state.
*/
if (!radix_enabled() || !kvm_is_radix(kvm) ||
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
__mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
save_clear_host_mmu(kvm);
if (kvm_is_radix(kvm)) {
switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
__mtmsrd(0, 1); /* clear RI */
} else {
switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
}
/* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);
/*
* P9 suppresses the HDEC exception when LPCR[HDICE] = 0,
* so set guest LPCR (with HDICE) before writing HDEC.
*/
mtspr(SPRN_HDEC, hdec);
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
accumulate_time(vcpu, &vcpu->arch.guest_time);
kvmppc_p9_enter_guest(vcpu);
accumulate_time(vcpu, &vcpu->arch.rm_intr);
/* XXX: Could get these from r11/12 and paca exsave instead */
vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
/* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */
trap = local_paca->kvm_hstate.scratch0 & ~0x2;
/* HSRR interrupts leave MSR[RI] unchanged, SRR interrupts clear it. */
ri_set = false;
if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) {
if (trap != BOOK3S_INTERRUPT_SYSCALL &&
(vcpu->arch.shregs.msr & MSR_RI))
ri_set = true;
exsave = local_paca->exgen;
} else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) {
exsave = local_paca->exnmi;
} else { /* trap == 0x200 */
exsave = local_paca->exmc;
}
vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
/*
* Only set RI after reading machine check regs (DAR, DSISR, SRR0/1)
* and hstate scratch (which we need to move into exsave to make
* re-entrant vs SRESET/MCE)
*/
if (ri_set) {
if (unlikely(!(mfmsr() & MSR_RI))) {
__mtmsrd(MSR_RI, 1);
WARN_ON_ONCE(1);
}
} else {
WARN_ON_ONCE(mfmsr() & MSR_RI);
__mtmsrd(MSR_RI, 1);
}
vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
kvmppc_realmode_machine_check(vcpu);
} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
kvmppc_realmode_hmi_handler();
} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
} else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
} else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
} else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Softpatch interrupt for transactional memory emulation cases
* on POWER9 DD2.2. This is early in the guest exit path - we
* haven't saved registers or done a treclaim yet.
*/
} else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
/*
* The cases we want to handle here are those where the guest
* is in real suspend mode and is trying to transition to
* transactional mode.
*/
if (local_paca->kvm_hstate.fake_suspend &&
(vcpu->arch.shregs.msr & MSR_TS_S)) {
if (kvmhv_p9_tm_emulation_early(vcpu)) {
/* Prevent it being handled again. */
trap = 0;
}
}
#endif
}
accumulate_time(vcpu, &vcpu->arch.rm_exit);
/* Advance host PURR/SPURR by the amount used by guest */
purr = mfspr(SPRN_PURR);
spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
purr - vcpu->arch.purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
spurr - vcpu->arch.spurr);
vcpu->arch.purr = purr;
vcpu->arch.spurr = spurr;
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
mtspr(SPRN_PSSCR, host_psscr |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR0, host_dawr0);
mtspr(SPRN_DAWRX0, host_dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
mtspr(SPRN_DAWR1, host_dawr1);
mtspr(SPRN_DAWRX1, host_dawrx1);
}
if (kvm_is_radix(kvm)) {
/*
* Since this is radix, do a eieio; tlbsync; ptesync sequence
* in case we interrupted the guest between a tlbie and a
* ptesync.
*/
asm volatile("eieio; tlbsync; ptesync");
}
/*
* cp_abort is required if the processor supports local copy-paste
* to clear the copy buffer that was under control of the guest.
*/
if (cpu_has_feature(CPU_FTR_ARCH_31))
asm volatile(PPC_CP_ABORT);
vc->dpdes = mfspr(SPRN_DPDES);
vc->vtb = mfspr(SPRN_VTB);
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
if (vc->tb_offset_applied) {
u64 new_tb = mftb() - vc->tb_offset_applied;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = 0;
}
mtspr(SPRN_HDEC, 0x7fffffff);
save_clear_guest_mmu(kvm, vcpu);
switch_mmu_to_host(kvm, host_pidr);
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
/*
* If we are in real mode, only switch MMU on after the MMU is
* switched to host, to avoid the P9_RADIX_PREFETCH_BUG.
*/
__mtmsrd(msr, 0);
end_timing(vcpu);
return trap;
}
EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9);

View File

@ -46,6 +46,10 @@ static int global_invalidates(struct kvm *kvm)
else
global = 1;
/* LPID has been switched to host if in virt mode so can't do local */
if (!global && (mfmsr() & (MSR_IR|MSR_DR)))
global = 1;
if (!global) {
/* any other core might now have stale TLB entries... */
smp_wmb();
@ -56,7 +60,7 @@ static int global_invalidates(struct kvm *kvm)
* so use the bit for the first thread to represent the core.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
cpu = cpu_first_thread_sibling(cpu);
cpu = cpu_first_tlb_thread_sibling(cpu);
cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
}
@ -398,6 +402,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
vcpu->arch.pgdir, true,
&vcpu->arch.regs.gpr[4]);
}
EXPORT_SYMBOL_GPL(kvmppc_h_enter);
#ifdef __BIG_ENDIAN__
#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
@ -542,6 +547,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
&vcpu->arch.regs.gpr[4]);
}
EXPORT_SYMBOL_GPL(kvmppc_h_remove);
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
@ -660,6 +666,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_bulk_remove);
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index, unsigned long avpn)
@ -730,6 +737,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_h_protect);
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@ -770,6 +778,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
}
return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_h_read);
long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@ -818,6 +827,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_clear_ref);
long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@ -865,6 +875,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_clear_mod);
static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
unsigned long gpa, int writing, unsigned long *hpa,
@ -1283,3 +1294,4 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
return -1; /* send fault up to host kernel mode */
}
EXPORT_SYMBOL_GPL(kvmppc_hpte_hv_fault);

View File

@ -141,13 +141,6 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
return;
}
if (xive_enabled() && kvmhv_on_pseries()) {
/* No XICS access or hypercalls available, too hard */
this_icp->rm_action |= XICS_RM_KICK_VCPU;
this_icp->rm_kick_target = vcpu;
return;
}
/*
* Check if the core is loaded,
* if not, find an available host core to post to wake the VCPU,
@ -771,14 +764,6 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
void __iomem *xics_phys;
int64_t rc;
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
iosync();
plpar_hcall_raw(H_EOI, retbuf, hwirq);
return;
}
rc = pnv_opal_pci_msi_eoi(c, hwirq);
if (rc)

File diff suppressed because it is too large Load Diff

View File

@ -493,7 +493,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
if (!vcpu->arch.pending_exceptions) {
kvm_vcpu_block(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
vcpu->stat.halt_wakeup++;
vcpu->stat.generic.halt_wakeup++;
/* Unset POW bit after we woke up */
msr &= ~MSR_POW;

View File

@ -378,7 +378,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
kvm_vcpu_block(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
vcpu->stat.halt_wakeup++;
vcpu->stat.generic.halt_wakeup++;
return EMULATE_DONE;
case H_LOGICAL_CI_LOAD:
return kvmppc_h_pr_logical_ci_load(vcpu);

View File

@ -164,12 +164,15 @@ kvmppc_interrupt_pr:
/* 64-bit entry. Register usage at this point:
*
* SPRG_SCRATCH0 = guest R13
* R9 = HSTATE_IN_GUEST
* R12 = (guest CR << 32) | exit handler id
* R13 = PACA
* HSTATE.SCRATCH0 = guest R12
* HSTATE.SCRATCH2 = guest R9
*/
#ifdef CONFIG_PPC64
/* Match 32-bit entry */
ld r9,HSTATE_SCRATCH2(r13)
rotldi r12, r12, 32 /* Flip R12 halves for stw */
stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
srdi r12, r12, 32 /* shift trap into low half */

View File

@ -127,6 +127,71 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
/*
* Pull a vcpu's context from the XIVE on guest exit.
* This assumes we are in virtual mode (MMU on)
*/
void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
{
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
if (!vcpu->arch.xive_pushed)
return;
/*
* Should not have been pushed if there is no tima
*/
if (WARN_ON(!tima))
return;
eieio();
/* First load to pull the context, we ignore the value */
__raw_readl(tima + TM_SPC_PULL_OS_CTX);
/* Second load to recover the context state (Words 0 and 1) */
vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
/* Fixup some of the state for the next load */
vcpu->arch.xive_saved_state.lsmfb = 0;
vcpu->arch.xive_saved_state.ack = 0xff;
vcpu->arch.xive_pushed = 0;
eieio();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
{
void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
if (!esc_vaddr)
return;
/* we are using XIVE with single escalation */
if (vcpu->arch.xive_esc_on) {
/*
* If we still have a pending escalation, abort the cede,
* and we must set PQ to 10 rather than 00 so that we don't
* potentially end up with two entries for the escalation
* interrupt in the XIVE interrupt queue. In that case
* we also don't want to set xive_esc_on to 1 here in
* case we race with xive_esc_irq().
*/
vcpu->arch.ceded = 0;
/*
* The escalation interrupts are special as we don't EOI them.
* There is no need to use the load-after-store ordering offset
* to set PQ to 10 as we won't use StoreEOI.
*/
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
} else {
vcpu->arch.xive_esc_on = true;
mb();
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
}
mb();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
/*
* This is a simple trigger for a generic XIVE IRQ. This must
* only be called for interrupts that support a trigger page
@ -2075,6 +2140,36 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
return 0;
}
int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
/* The VM should have configured XICS mode before doing XICS hcalls. */
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
switch (req) {
case H_XIRR:
return xive_vm_h_xirr(vcpu);
case H_CPPR:
return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
case H_EOI:
return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
case H_IPI:
return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5));
case H_IPOLL:
return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
case H_XIRR_X:
xive_vm_h_xirr(vcpu);
kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
return H_SUCCESS;
}
return H_UNSUPPORTED;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@ -2257,21 +2352,3 @@ struct kvm_device_ops kvm_xive_ops = {
.get_attr = xive_get_attr,
.has_attr = xive_has_attr,
};
void kvmppc_xive_init_module(void)
{
__xive_vm_h_xirr = xive_vm_h_xirr;
__xive_vm_h_ipoll = xive_vm_h_ipoll;
__xive_vm_h_ipi = xive_vm_h_ipi;
__xive_vm_h_cppr = xive_vm_h_cppr;
__xive_vm_h_eoi = xive_vm_h_eoi;
}
void kvmppc_xive_exit_module(void)
{
__xive_vm_h_xirr = NULL;
__xive_vm_h_ipoll = NULL;
__xive_vm_h_ipi = NULL;
__xive_vm_h_cppr = NULL;
__xive_vm_h_eoi = NULL;
}

View File

@ -289,13 +289,6 @@ extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr);
extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
/*
* Common Xive routines for XICS-over-XIVE and XIVE native
*/

View File

@ -1281,13 +1281,3 @@ struct kvm_device_ops kvm_xive_native_ops = {
.has_attr = kvmppc_xive_native_has_attr,
.mmap = kvmppc_xive_native_mmap,
};
void kvmppc_xive_native_init_module(void)
{
;
}
void kvmppc_xive_native_exit_module(void)
{
;
}

View File

@ -36,29 +36,59 @@
unsigned long kvmppc_booke_handlers;
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
VCPU_STAT("itlb_r", itlb_real_miss_exits),
VCPU_STAT("itlb_v", itlb_virt_miss_exits),
VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
VCPU_STAT("sysc", syscall_exits),
VCPU_STAT("isi", isi_exits),
VCPU_STAT("dsi", dsi_exits),
VCPU_STAT("inst_emu", emulated_inst_exits),
VCPU_STAT("dec", dec_exits),
VCPU_STAT("ext_intr", ext_intr_exits),
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("doorbell", dbell_exits),
VCPU_STAT("guest doorbell", gdbell_exits),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VM_STAT("remote_tlb_flush", remote_tlb_flush),
{ NULL }
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_ICOUNTER(VM, num_2M_pages),
STATS_DESC_ICOUNTER(VM, num_1G_pages)
};
static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vm_stats_desc),
};
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, sum_exits),
STATS_DESC_COUNTER(VCPU, mmio_exits),
STATS_DESC_COUNTER(VCPU, signal_exits),
STATS_DESC_COUNTER(VCPU, light_exits),
STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
STATS_DESC_COUNTER(VCPU, syscall_exits),
STATS_DESC_COUNTER(VCPU, isi_exits),
STATS_DESC_COUNTER(VCPU, dsi_exits),
STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
STATS_DESC_COUNTER(VCPU, dec_exits),
STATS_DESC_COUNTER(VCPU, ext_intr_exits),
STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
STATS_DESC_COUNTER(VCPU, halt_successful_wait),
STATS_DESC_COUNTER(VCPU, dbell_exits),
STATS_DESC_COUNTER(VCPU, gdbell_exits),
STATS_DESC_COUNTER(VCPU, ld),
STATS_DESC_COUNTER(VCPU, st),
STATS_DESC_COUNTER(VCPU, pthru_all),
STATS_DESC_COUNTER(VCPU, pthru_host),
STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vcpu_stats_desc),
};
/* TODO: use vcpu_printf() */

View File

@ -682,6 +682,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
!kvmppc_hv_ops->enable_dawr1(NULL));
break;
case KVM_CAP_PPC_RPT_INVALIDATE:
r = 1;
break;
#endif
default:
r = 0;

View File

@ -357,30 +357,19 @@ static void __init radix_init_pgtable(void)
}
/* Find out how many PID bits are supported */
if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
if (!mmu_pid_bits)
mmu_pid_bits = 20;
mmu_base_pid = 1;
} else if (cpu_has_feature(CPU_FTR_HVMODE)) {
if (!mmu_pid_bits)
mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
/*
* When KVM is possible, we only use the top half of the
* PID space to avoid collisions between host and guest PIDs
* which can cause problems due to prefetch when exiting the
* guest with AIL=3
* Older versions of KVM on these machines perfer if the
* guest only uses the low 19 PID bits.
*/
mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
mmu_base_pid = 1;
#endif
} else {
/* The guest uses the bottom half of the PID space */
if (!mmu_pid_bits)
mmu_pid_bits = 19;
mmu_base_pid = 1;
} else {
if (!mmu_pid_bits)
mmu_pid_bits = 20;
}
mmu_base_pid = 1;
/*
* Allocate Partition table and process table for the
@ -486,6 +475,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
def = &mmu_psize_defs[idx];
def->shift = shift;
def->ap = ap;
def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
}
/* needed ? */
@ -560,9 +550,13 @@ void __init radix__early_init_devtree(void)
*/
mmu_psize_defs[MMU_PAGE_4K].shift = 12;
mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_4K);
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
/*

View File

@ -20,10 +20,6 @@
#include "internal.h"
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
#define RIC_FLUSH_ALL 2
/*
* tlbiel instruction for radix, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
@ -130,6 +126,21 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static __always_inline void __tlbie_pid_lpid(unsigned long pid,
unsigned long lpid,
unsigned long ric)
{
unsigned long rb, rs, prs, r;
rb = PPC_BIT(53); /* IS = 1 */
rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
prs = 1; /* process scoped */
r = 1; /* radix format */
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@ -190,6 +201,23 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
unsigned long lpid,
unsigned long ap, unsigned long ric)
{
unsigned long rb, rs, prs, r;
rb = va & ~(PPC_BITMASK(52, 63));
rb |= ap << PPC_BITLSHIFT(58);
rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
prs = 1; /* process scoped */
r = 1; /* radix format */
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap, unsigned long ric)
{
@ -235,6 +263,22 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
}
}
static inline void fixup_tlbie_va_range_lpid(unsigned long va,
unsigned long pid,
unsigned long lpid,
unsigned long ap)
{
if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
asm volatile("ptesync" : : : "memory");
__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
}
if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
asm volatile("ptesync" : : : "memory");
__tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
}
}
static inline void fixup_tlbie_pid(unsigned long pid)
{
/*
@ -254,6 +298,25 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
{
/*
* We can use any address for the invalidation, pick one which is
* probably unused as an optimisation.
*/
unsigned long va = ((1UL << 52) - 1);
if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
asm volatile("ptesync" : : : "memory");
__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
}
if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
asm volatile("ptesync" : : : "memory");
__tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
RIC_FLUSH_TLB);
}
}
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
@ -344,6 +407,31 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
unsigned long ric)
{
asm volatile("ptesync" : : : "memory");
/*
* Workaround the fact that the "ric" argument to __tlbie_pid
* must be a compile-time contraint to match the "i" constraint
* in the asm statement.
*/
switch (ric) {
case RIC_FLUSH_TLB:
__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
fixup_tlbie_pid_lpid(pid, lpid);
break;
case RIC_FLUSH_PWC:
__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
break;
case RIC_FLUSH_ALL:
default:
__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
fixup_tlbie_pid_lpid(pid, lpid);
}
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
struct tlbiel_pid {
unsigned long pid;
unsigned long ric;
@ -469,6 +557,20 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
fixup_tlbie_va_range(addr - page_size, pid, ap);
}
static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
unsigned long pid, unsigned long lpid,
unsigned long page_size,
unsigned long psize)
{
unsigned long addr;
unsigned long ap = mmu_get_ap(psize);
for (addr = start; addr < end; addr += page_size)
__tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
}
static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@ -549,6 +651,18 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
unsigned long pid, unsigned long lpid,
unsigned long page_size,
unsigned long psize, bool also_pwc)
{
asm volatile("ptesync" : : : "memory");
if (also_pwc)
__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
__tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
@ -1338,47 +1452,57 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
/*
* Performs process-scoped invalidations for a given LPID
* as part of H_RPT_INVALIDATE hcall.
*/
void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
unsigned long type, unsigned long pg_sizes,
unsigned long start, unsigned long end)
{
unsigned long pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
return;
unsigned long psize, nr_pages;
struct mmu_psize_def *def;
bool flush_pid;
/*
* If this context hasn't run on that CPU before and KVM is
* around, there's a slim chance that the guest on another
* CPU just brought in obsolete translation into the TLB of
* this CPU due to a bad prefetch using the guest PID on
* the way into the hypervisor.
*
* We work around this here. If KVM is possible, we check if
* any sibling thread is in KVM. If it is, the window may exist
* and thus we flush that PID from the core.
*
* A potential future improvement would be to mark which PIDs
* have never been used on the system and avoid it if the PID
* is new and the process has no other cpumask bit set.
* A H_RPTI_TYPE_ALL request implies RIC=3, hence
* do a single IS=1 based flush.
*/
if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
int cpu = smp_processor_id();
int sib = cpu_first_thread_sibling(cpu);
bool flush = false;
if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
return;
}
for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
if (sib == cpu)
continue;
if (!cpu_possible(sib))
continue;
if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
flush = true;
if (type & H_RPTI_TYPE_PWC)
_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
/* Full PID flush */
if (start == 0 && end == -1)
return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
/* Do range invalidation for all the valid page sizes */
for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
def = &mmu_psize_defs[psize];
if (!(pg_sizes & def->h_rpt_pgsize))
continue;
nr_pages = (end - start) >> def->shift;
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
/*
* If the number of pages spanning the range is above
* the ceiling, convert the request into a full PID flush.
* And since PID flush takes out all the page sizes, there
* is no need to consider remaining page sizes.
*/
if (flush_pid) {
_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
return;
}
if (flush)
_tlbiel_pid(pid, RIC_FLUSH_ALL);
_tlbie_va_range_lpid(start, end, pid, lpid,
(1UL << def->shift), psize, false);
}
}
EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

View File

@ -83,9 +83,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (cpu_has_feature(CPU_FTR_ALTIVEC))
asm volatile ("dssall");
if (new_on_cpu)
radix_kvm_prefetch_workaround(next);
else
if (!new_on_cpu)
membarrier_arch_switch_mm(prev, next, tsk);
/*

View File

@ -604,7 +604,7 @@ struct p9_sprs {
u64 uamor;
};
static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
static unsigned long power9_idle_stop(unsigned long psscr)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
@ -620,8 +620,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
BUG_ON(!mmu_on);
/*
* Wake synchronously. SRESET via xscom may still cause
* a 0x100 powersave wakeup with SRR1 reason!
@ -803,8 +801,7 @@ core_woken:
__slb_restore_bolted_realmode();
out:
if (mmu_on)
mtmsr(MSR_KERNEL);
mtmsr(MSR_KERNEL);
return srr1;
}
@ -895,7 +892,7 @@ struct p10_sprs {
*/
};
static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on)
static unsigned long power10_idle_stop(unsigned long psscr)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
@ -909,8 +906,6 @@ static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on)
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
BUG_ON(!mmu_on);
/*
* Wake synchronously. SRESET via xscom may still cause
* a 0x100 powersave wakeup with SRR1 reason!
@ -991,8 +986,7 @@ core_woken:
__slb_restore_bolted_realmode();
out:
if (mmu_on)
mtmsr(MSR_KERNEL);
mtmsr(MSR_KERNEL);
return srr1;
}
@ -1002,40 +996,10 @@ static unsigned long arch300_offline_stop(unsigned long psscr)
{
unsigned long srr1;
#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
__ppc64_runlatch_off();
if (cpu_has_feature(CPU_FTR_ARCH_31))
srr1 = power10_idle_stop(psscr, true);
srr1 = power10_idle_stop(psscr);
else
srr1 = power9_idle_stop(psscr, true);
__ppc64_runlatch_on();
#else
/*
* Tell KVM we're entering idle.
* This does not have to be done in real mode because the P9 MMU
* is independent per-thread. Some steppings share radix/hash mode
* between threads, but in that case KVM has a barrier sync in real
* mode before and after switching between radix and hash.
*
* kvm_start_guest must still be called in real mode though, hence
* the false argument.
*/
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
__ppc64_runlatch_off();
if (cpu_has_feature(CPU_FTR_ARCH_31))
srr1 = power10_idle_stop(psscr, false);
else
srr1 = power9_idle_stop(psscr, false);
__ppc64_runlatch_on();
local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
/* Order setting hwthread_state vs. testing hwthread_req */
smp_mb();
if (local_paca->kvm_hstate.hwthread_req)
srr1 = idle_kvm_start_guest(srr1);
mtmsr(MSR_KERNEL);
#endif
srr1 = power9_idle_stop(psscr);
return srr1;
}
@ -1055,9 +1019,9 @@ void arch300_idle_type(unsigned long stop_psscr_val,
__ppc64_runlatch_off();
if (cpu_has_feature(CPU_FTR_ARCH_31))
srr1 = power10_idle_stop(psscr, true);
srr1 = power10_idle_stop(psscr);
else
srr1 = power9_idle_stop(psscr, true);
srr1 = power9_idle_stop(psscr);
__ppc64_runlatch_on();
fini_irq_for_idle_irqsoff();

View File

@ -361,6 +361,7 @@ struct sie_page {
};
struct kvm_vcpu_stat {
struct kvm_vcpu_stat_generic generic;
u64 exit_userspace;
u64 exit_null;
u64 exit_external_request;
@ -370,13 +371,7 @@ struct kvm_vcpu_stat {
u64 exit_validity;
u64 exit_instruction;
u64 exit_pei;
u64 halt_successful_poll;
u64 halt_attempted_poll;
u64 halt_poll_invalid;
u64 halt_no_poll_steal;
u64 halt_wakeup;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
u64 instruction_lctl;
u64 instruction_lctlg;
u64 instruction_stctl;
@ -755,12 +750,12 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
u64 inject_io;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
u64 remote_tlb_flush;
};
struct kvm_arch_memory_slot {

View File

@ -4,7 +4,8 @@
# Copyright IBM Corp. 2008
KVM := ../../../virt/kvm
common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o \
$(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm

View File

@ -58,112 +58,132 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("userspace_handled", exit_userspace),
VCPU_STAT("exit_null", exit_null),
VCPU_STAT("pfault_sync", pfault_sync),
VCPU_STAT("exit_validity", exit_validity),
VCPU_STAT("exit_stop_request", exit_stop_request),
VCPU_STAT("exit_external_request", exit_external_request),
VCPU_STAT("exit_io_request", exit_io_request),
VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
VCPU_STAT("exit_instruction", exit_instruction),
VCPU_STAT("exit_pei", exit_pei),
VCPU_STAT("exit_program_interruption", exit_program_interruption),
VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
VCPU_STAT("exit_operation_exception", exit_operation_exception),
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VCPU_STAT("instruction_lctlg", instruction_lctlg),
VCPU_STAT("instruction_lctl", instruction_lctl),
VCPU_STAT("instruction_stctl", instruction_stctl),
VCPU_STAT("instruction_stctg", instruction_stctg),
VCPU_STAT("deliver_ckc", deliver_ckc),
VCPU_STAT("deliver_cputm", deliver_cputm),
VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
VCPU_STAT("deliver_external_call", deliver_external_call),
VCPU_STAT("deliver_service_signal", deliver_service_signal),
VCPU_STAT("deliver_virtio", deliver_virtio),
VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
VCPU_STAT("deliver_program", deliver_program),
VCPU_STAT("deliver_io", deliver_io),
VCPU_STAT("deliver_machine_check", deliver_machine_check),
VCPU_STAT("exit_wait_state", exit_wait_state),
VCPU_STAT("inject_ckc", inject_ckc),
VCPU_STAT("inject_cputm", inject_cputm),
VCPU_STAT("inject_external_call", inject_external_call),
VM_STAT("inject_float_mchk", inject_float_mchk),
VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
VM_STAT("inject_io", inject_io),
VCPU_STAT("inject_mchk", inject_mchk),
VM_STAT("inject_pfault_done", inject_pfault_done),
VCPU_STAT("inject_program", inject_program),
VCPU_STAT("inject_restart", inject_restart),
VM_STAT("inject_service_signal", inject_service_signal),
VCPU_STAT("inject_set_prefix", inject_set_prefix),
VCPU_STAT("inject_stop_signal", inject_stop_signal),
VCPU_STAT("inject_pfault_init", inject_pfault_init),
VM_STAT("inject_virtio", inject_virtio),
VCPU_STAT("instruction_epsw", instruction_epsw),
VCPU_STAT("instruction_gs", instruction_gs),
VCPU_STAT("instruction_io_other", instruction_io_other),
VCPU_STAT("instruction_lpsw", instruction_lpsw),
VCPU_STAT("instruction_lpswe", instruction_lpswe),
VCPU_STAT("instruction_pfmf", instruction_pfmf),
VCPU_STAT("instruction_ptff", instruction_ptff),
VCPU_STAT("instruction_stidp", instruction_stidp),
VCPU_STAT("instruction_sck", instruction_sck),
VCPU_STAT("instruction_sckpf", instruction_sckpf),
VCPU_STAT("instruction_spx", instruction_spx),
VCPU_STAT("instruction_stpx", instruction_stpx),
VCPU_STAT("instruction_stap", instruction_stap),
VCPU_STAT("instruction_iske", instruction_iske),
VCPU_STAT("instruction_ri", instruction_ri),
VCPU_STAT("instruction_rrbe", instruction_rrbe),
VCPU_STAT("instruction_sske", instruction_sske),
VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
VCPU_STAT("instruction_essa", instruction_essa),
VCPU_STAT("instruction_stsi", instruction_stsi),
VCPU_STAT("instruction_stfl", instruction_stfl),
VCPU_STAT("instruction_tb", instruction_tb),
VCPU_STAT("instruction_tpi", instruction_tpi),
VCPU_STAT("instruction_tprot", instruction_tprot),
VCPU_STAT("instruction_tsch", instruction_tsch),
VCPU_STAT("instruction_sthyi", instruction_sthyi),
VCPU_STAT("instruction_sie", instruction_sie),
VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
VCPU_STAT("instruction_diag_10", diagnose_10),
VCPU_STAT("instruction_diag_44", diagnose_44),
VCPU_STAT("instruction_diag_9c", diagnose_9c),
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
VCPU_STAT("instruction_diag_258", diagnose_258),
VCPU_STAT("instruction_diag_308", diagnose_308),
VCPU_STAT("instruction_diag_500", diagnose_500),
VCPU_STAT("instruction_diag_other", diagnose_other),
{ NULL }
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, inject_io),
STATS_DESC_COUNTER(VM, inject_float_mchk),
STATS_DESC_COUNTER(VM, inject_pfault_done),
STATS_DESC_COUNTER(VM, inject_service_signal),
STATS_DESC_COUNTER(VM, inject_virtio)
};
static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vm_stats_desc),
};
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
KVM_GENERIC_VCPU_STATS(),
STATS_DESC_COUNTER(VCPU, exit_userspace),
STATS_DESC_COUNTER(VCPU, exit_null),
STATS_DESC_COUNTER(VCPU, exit_external_request),
STATS_DESC_COUNTER(VCPU, exit_io_request),
STATS_DESC_COUNTER(VCPU, exit_external_interrupt),
STATS_DESC_COUNTER(VCPU, exit_stop_request),
STATS_DESC_COUNTER(VCPU, exit_validity),
STATS_DESC_COUNTER(VCPU, exit_instruction),
STATS_DESC_COUNTER(VCPU, exit_pei),
STATS_DESC_COUNTER(VCPU, halt_no_poll_steal),
STATS_DESC_COUNTER(VCPU, instruction_lctl),
STATS_DESC_COUNTER(VCPU, instruction_lctlg),
STATS_DESC_COUNTER(VCPU, instruction_stctl),
STATS_DESC_COUNTER(VCPU, instruction_stctg),
STATS_DESC_COUNTER(VCPU, exit_program_interruption),
STATS_DESC_COUNTER(VCPU, exit_instr_and_program),
STATS_DESC_COUNTER(VCPU, exit_operation_exception),
STATS_DESC_COUNTER(VCPU, deliver_ckc),
STATS_DESC_COUNTER(VCPU, deliver_cputm),
STATS_DESC_COUNTER(VCPU, deliver_external_call),
STATS_DESC_COUNTER(VCPU, deliver_emergency_signal),
STATS_DESC_COUNTER(VCPU, deliver_service_signal),
STATS_DESC_COUNTER(VCPU, deliver_virtio),
STATS_DESC_COUNTER(VCPU, deliver_stop_signal),
STATS_DESC_COUNTER(VCPU, deliver_prefix_signal),
STATS_DESC_COUNTER(VCPU, deliver_restart_signal),
STATS_DESC_COUNTER(VCPU, deliver_program),
STATS_DESC_COUNTER(VCPU, deliver_io),
STATS_DESC_COUNTER(VCPU, deliver_machine_check),
STATS_DESC_COUNTER(VCPU, exit_wait_state),
STATS_DESC_COUNTER(VCPU, inject_ckc),
STATS_DESC_COUNTER(VCPU, inject_cputm),
STATS_DESC_COUNTER(VCPU, inject_external_call),
STATS_DESC_COUNTER(VCPU, inject_emergency_signal),
STATS_DESC_COUNTER(VCPU, inject_mchk),
STATS_DESC_COUNTER(VCPU, inject_pfault_init),
STATS_DESC_COUNTER(VCPU, inject_program),
STATS_DESC_COUNTER(VCPU, inject_restart),
STATS_DESC_COUNTER(VCPU, inject_set_prefix),
STATS_DESC_COUNTER(VCPU, inject_stop_signal),
STATS_DESC_COUNTER(VCPU, instruction_epsw),
STATS_DESC_COUNTER(VCPU, instruction_gs),
STATS_DESC_COUNTER(VCPU, instruction_io_other),
STATS_DESC_COUNTER(VCPU, instruction_lpsw),
STATS_DESC_COUNTER(VCPU, instruction_lpswe),
STATS_DESC_COUNTER(VCPU, instruction_pfmf),
STATS_DESC_COUNTER(VCPU, instruction_ptff),
STATS_DESC_COUNTER(VCPU, instruction_sck),
STATS_DESC_COUNTER(VCPU, instruction_sckpf),
STATS_DESC_COUNTER(VCPU, instruction_stidp),
STATS_DESC_COUNTER(VCPU, instruction_spx),
STATS_DESC_COUNTER(VCPU, instruction_stpx),
STATS_DESC_COUNTER(VCPU, instruction_stap),
STATS_DESC_COUNTER(VCPU, instruction_iske),
STATS_DESC_COUNTER(VCPU, instruction_ri),
STATS_DESC_COUNTER(VCPU, instruction_rrbe),
STATS_DESC_COUNTER(VCPU, instruction_sske),
STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock),
STATS_DESC_COUNTER(VCPU, instruction_stsi),
STATS_DESC_COUNTER(VCPU, instruction_stfl),
STATS_DESC_COUNTER(VCPU, instruction_tb),
STATS_DESC_COUNTER(VCPU, instruction_tpi),
STATS_DESC_COUNTER(VCPU, instruction_tprot),
STATS_DESC_COUNTER(VCPU, instruction_tsch),
STATS_DESC_COUNTER(VCPU, instruction_sie),
STATS_DESC_COUNTER(VCPU, instruction_essa),
STATS_DESC_COUNTER(VCPU, instruction_sthyi),
STATS_DESC_COUNTER(VCPU, instruction_sigp_sense),
STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running),
STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call),
STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency),
STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency),
STATS_DESC_COUNTER(VCPU, instruction_sigp_start),
STATS_DESC_COUNTER(VCPU, instruction_sigp_stop),
STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status),
STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status),
STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status),
STATS_DESC_COUNTER(VCPU, instruction_sigp_arch),
STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix),
STATS_DESC_COUNTER(VCPU, instruction_sigp_restart),
STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
STATS_DESC_COUNTER(VCPU, diagnose_10),
STATS_DESC_COUNTER(VCPU, diagnose_44),
STATS_DESC_COUNTER(VCPU, diagnose_9c),
STATS_DESC_COUNTER(VCPU, diagnose_9c_ignored),
STATS_DESC_COUNTER(VCPU, diagnose_9c_forward),
STATS_DESC_COUNTER(VCPU, diagnose_258),
STATS_DESC_COUNTER(VCPU, diagnose_308),
STATS_DESC_COUNTER(VCPU, diagnose_500),
STATS_DESC_COUNTER(VCPU, diagnose_other),
STATS_DESC_COUNTER(VCPU, pfault_sync)
};
static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
.id_offset = sizeof(struct kvm_stats_header),
.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
sizeof(kvm_vcpu_stats_desc),
};
/* allow nested virtualization in KVM (if enabled by user space) */
@ -329,31 +349,31 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{
register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
unsigned long function = (unsigned long)nr | 0x100;
int cc;
asm volatile(
" lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc)
: "d" (r0)
: "cc");
: [function] "d" (function)
: "cc", "0");
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
{
register unsigned long r0 asm("0") = 0; /* query function */
register unsigned long r1 asm("1") = (unsigned long) query;
asm volatile(
/* Parameter regs are ignored */
" lghi 0,0\n"
" lgr 1,%[query]\n"
/* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
:
: "d" (r0), "a" (r1), [opc] "i" (opcode)
: "cc", "memory");
: [query] "d" ((unsigned long)query), [opc] "i" (opcode)
: "cc", "memory", "0", "1");
}
#define INSN_SORTL 0xb938
@ -713,6 +733,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
set_kvm_facility(kvm->arch.model.fac_mask, 152);
set_kvm_facility(kvm->arch.model.fac_list, 152);
}
if (test_facility(192)) {
set_kvm_facility(kvm->arch.model.fac_mask, 192);
set_kvm_facility(kvm->arch.model.fac_list, 192);
}
r = 0;
} else
r = -EINVAL;

View File

@ -115,6 +115,10 @@ static struct facility_def facility_defs[] = {
12, /* AP Query Configuration Information */
15, /* AP Facilities Test */
156, /* etoken facility */
165, /* nnpa facility */
193, /* bear enhancement facility */
194, /* rdp enhancement facility */
196, /* processor activity instrumentation facility */
-1 /* END */
}
},

View File

@ -52,7 +52,7 @@
* Support for passing hypercall input parameter block via XMM
* registers is available
*/
#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4)
#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5)
/* Frequency MSRs available */
@ -61,6 +61,11 @@
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
/* Support for debug MSRs available */
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
/*
* Support for returning hypercall output block via XMM
* registers is available
*/
#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15)
/* stimer Direct Mode is available */
#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19)
@ -133,6 +138,15 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
/*
* This is specific to AMD and specifies that enlightened TLB flush is
* supported. If guest opts in to this feature, ASID invalidations only
* flushes gva -> hpa mapping entries. To flush the TLB entries derived
* from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace
* or HvFlushGuestPhysicalAddressList).
*/
#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22)
/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
#define HV_PARAVISOR_PRESENT BIT(0)
@ -314,6 +328,9 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
/* Number of XMM registers used in hypercall input/output */
#define HV_HYPERCALL_MAX_XMM_REGISTERS 6
struct hv_nested_enlightenments_control {
struct {
__u32 directhypercall:1;

View File

@ -87,7 +87,10 @@ KVM_X86_OP(set_identity_map_addr)
KVM_X86_OP(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_NULL(has_wbinvd_exit)
KVM_X86_OP(write_l1_tsc_offset)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
KVM_X86_OP(write_tsc_offset)
KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
@ -106,8 +109,8 @@ KVM_X86_OP_NULL(set_hv_timer)
KVM_X86_OP_NULL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
KVM_X86_OP(smi_allowed)
KVM_X86_OP(pre_enter_smm)
KVM_X86_OP(pre_leave_smm)
KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
KVM_X86_OP_NULL(mem_enc_op)
KVM_X86_OP_NULL(mem_enc_reg_region)

View File

@ -85,7 +85,7 @@
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_HV_TLB_FLUSH \
#define KVM_REQ_TLB_FLUSH_GUEST \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
@ -269,12 +269,36 @@ enum x86_intercept_stage;
struct kvm_kernel_irq_routing_entry;
/*
* the pages used as guest page table on soft mmu are tracked by
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
* by indirect shadow page can not be more than 15 bits.
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
* the given MMU context. This is a subset of the overall kvm_mmu_role to
* minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
* 2 bytes per gfn instead of 4 bytes per gfn.
*
* Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
* @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
* Indirect upper-level shadow pages are tracked for write-protection via
* gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
* more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
* gfn_track will overflow and explosions will ensure.
*
* A unique shadow page (SP) for a gfn is created if and only if an existing SP
* cannot be reused. The ability to reuse a SP is tracked by its role, which
* incorporates various mode bits and properties of the SP. Roughly speaking,
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
* But, even though there are 18 bits in the mask below, not all combinations
* of modes and flags are possible. The maximum number of possible upper-level
* shadow pages for a single gfn is in the neighborhood of 2^13.
*
* - invalid shadow pages are not accounted.
* - level is effectively limited to four combinations, not 16 as the number
* bits would imply, as 4k SPs are not tracked (allowed to go unsync).
* - level is effectively unused for non-PAE paging because there is exactly
* one upper level (see 4k SP exception above).
* - quadrant is used only for non-PAE paging and is exclusive with
* gpte_is_8_bytes.
* - execonly and ad_disabled are used only for nested EPT, which makes it
* exclusive with quadrant.
*/
union kvm_mmu_page_role {
u32 word;
@ -285,7 +309,7 @@ union kvm_mmu_page_role {
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
unsigned nxe:1;
unsigned efer_nx:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
@ -303,13 +327,26 @@ union kvm_mmu_page_role {
};
};
union kvm_mmu_extended_role {
/*
* This structure complements kvm_mmu_page_role caching everything needed for
* MMU configuration. If nothing in both these structures changed, MMU
* re-configuration can be skipped. @valid bit is set on first usage so we don't
* treat all-zero structure as valid data.
* kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
* relevant to the current MMU configuration. When loading CR0, CR4, or EFER,
* including on nested transitions, if nothing in the full role changes then
* MMU re-configuration can be skipped. @valid bit is set on first usage so we
* don't treat all-zero structure as valid data.
*
* The properties that are tracked in the extended role but not the page role
* are for things that either (a) do not affect the validity of the shadow page
* or (b) are indirectly reflected in the shadow page's role. For example,
* CR4.PKE only affects permission checks for software walks of the guest page
* tables (because KVM doesn't support Protection Keys with shadow paging), and
* CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
*
* Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
* If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
* SMAP, but the MMU's permission checks for software walks need to be SMEP and
* SMAP aware regardless of CR0.WP.
*/
union kvm_mmu_extended_role {
u32 word;
struct {
unsigned int valid:1;
@ -320,7 +357,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
unsigned int maxphyaddr:6;
unsigned int cr4_la57:1;
};
};
@ -420,11 +457,6 @@ struct kvm_mmu {
struct rsvd_bits_validate guest_rsvd_check;
/* Can have large pages at levels 2..last_nonleaf_level-1. */
u8 last_nonleaf_level;
bool nx;
u64 pdptrs[4]; /* pae */
};
@ -543,6 +575,15 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
cpumask_t tlb_flush;
bool enforce_cpuid;
struct {
u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
} cpuid_cache;
};
/* Xen HVM per vcpu emulation context */
@ -707,7 +748,7 @@ struct kvm_vcpu_arch {
} st;
u64 l1_tsc_offset;
u64 tsc_offset;
u64 tsc_offset; /* current tsc offset */
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@ -721,7 +762,8 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
u64 msr_ia32_power_ctl;
u64 tsc_scaling_ratio;
u64 l1_tsc_scaling_ratio;
u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@ -829,7 +871,7 @@ struct kvm_vcpu_arch {
bool l1tf_flush_l1d;
/* Host CPU on which VM-entry was most recently attempted */
unsigned int last_vmentry_cpu;
int last_vmentry_cpu;
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
@ -851,6 +893,16 @@ struct kvm_vcpu_arch {
/* Protected Guests */
bool guest_state_protected;
/*
* Set when PDPTS were loaded directly by the userspace without
* reading the guest memory
*/
bool pdptrs_from_userspace;
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
#endif
};
struct kvm_lpage_info {
@ -1002,7 +1054,7 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
bool apic_access_page_done;
bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@ -1062,11 +1114,19 @@ struct kvm_arch {
bool exception_payload_enabled;
bool bus_lock_detection_enabled;
/*
* If exit_on_emulation_error is set, and the in-kernel instruction
* emulator fails to emulate an instruction, allow userspace
* the opportunity to look at it.
*/
bool exit_on_emulation_error;
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
u32 hypercall_exit_enabled;
/* Guest can access the SGX PROVISIONKEY. */
bool sgx_provisioning_allowed;
@ -1124,23 +1184,35 @@ struct kvm_arch {
*/
spinlock_t tdp_mmu_pages_lock;
#endif /* CONFIG_X86_64 */
/*
* If set, rmaps have been allocated for all memslots and should be
* allocated for any newly created or modified memslots.
*/
bool memslots_have_rmaps;
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock;
#endif
};
struct kvm_vm_stat {
ulong mmu_shadow_zapped;
ulong mmu_pte_write;
ulong mmu_pde_zapped;
ulong mmu_flooded;
ulong mmu_recycled;
ulong mmu_cache_miss;
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
ulong nx_lpage_splits;
ulong max_mmu_page_hash_collisions;
struct kvm_vm_stat_generic generic;
u64 mmu_shadow_zapped;
u64 mmu_pte_write;
u64 mmu_pde_zapped;
u64 mmu_flooded;
u64 mmu_recycled;
u64 mmu_cache_miss;
u64 mmu_unsync;
u64 lpages;
u64 nx_lpage_splits;
u64 max_mmu_page_hash_collisions;
};
struct kvm_vcpu_stat {
struct kvm_vcpu_stat_generic generic;
u64 pf_fixed;
u64 pf_guest;
u64 tlb_flush;
@ -1154,10 +1226,6 @@ struct kvm_vcpu_stat {
u64 nmi_window_exits;
u64 l1d_flush;
u64 halt_exits;
u64 halt_successful_poll;
u64 halt_attempted_poll;
u64 halt_poll_invalid;
u64 halt_wakeup;
u64 request_irq_exits;
u64 irq_exits;
u64 host_state_reload;
@ -1168,11 +1236,10 @@ struct kvm_vcpu_stat {
u64 irq_injections;
u64 nmi_injections;
u64 req_event;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
u64 nested_run;
u64 directed_yield_attempted;
u64 directed_yield_successful;
u64 guest_mode;
};
struct x86_instruction_info;
@ -1304,8 +1371,10 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
/* Returns actual tsc_offset set in active VMCS */
u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
/*
* Retrieve somewhat arbitrary exit information. Intended to be used
@ -1363,8 +1432,8 @@ struct kvm_x86_ops {
void (*setup_mce)(struct kvm_vcpu *vcpu);
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
@ -1423,6 +1492,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP(func) \
@ -1463,6 +1533,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot,
@ -1477,7 +1548,6 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
bool pdptrs_changed(struct kvm_vcpu *vcpu);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@ -1650,6 +1720,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
@ -1662,7 +1733,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
void kvm_apicv_init(struct kvm *kvm, bool enable);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void kvm_request_apicv_update(struct kvm *kvm, bool activate,
unsigned long bit);
@ -1675,8 +1745,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t gva, hpa_t root_hpa);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
bool skip_mmu_sync);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
int tdp_huge_page_level);
@ -1788,8 +1857,10 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr)
return kvm_find_user_return_msr(msr) >= 0;
}
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
@ -1863,4 +1934,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
int kvm_cpu_dirty_log_size(void);
int alloc_all_memslots_rmaps(struct kvm *kvm);
#endif /* _ASM_X86_KVM_HOST_H */

View File

@ -156,6 +156,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
u8 reserved_8[720];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
*/
u8 reserved_sw[32];
};
@ -314,7 +320,7 @@ struct ghcb {
#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
@ -326,7 +332,6 @@ static inline void __unused_size_checks(void)
struct vmcb {
struct vmcb_control_area control;
u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
} __packed;

View File

@ -159,6 +159,19 @@ struct kvm_sregs {
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
struct kvm_sregs2 {
/* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
struct kvm_segment cs, ds, es, fs, gs, ss;
struct kvm_segment tr, ldt;
struct kvm_dtable gdt, idt;
__u64 cr0, cr2, cr3, cr4, cr8;
__u64 efer;
__u64 apic_base;
__u64 flags;
__u64 pdptrs[4];
};
#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1
/* for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];

View File

@ -33,6 +33,8 @@
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
#define KVM_FEATURE_HC_MAP_GPA_RANGE 16
#define KVM_FEATURE_MIGRATION_CONTROL 17
#define KVM_HINTS_REALTIME 0
@ -54,6 +56,7 @@
#define MSR_KVM_POLL_CONTROL 0x4b564d05
#define MSR_KVM_ASYNC_PF_INT 0x4b564d06
#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08
struct kvm_steal_time {
__u64 steal;
@ -90,6 +93,16 @@ struct kvm_clock_pairing {
/* MSR_KVM_ASYNC_PF_INT */
#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0)
/* MSR_KVM_MIGRATION_CONTROL */
#define KVM_MIGRATION_READY (1 << 0)
/* KVM_HC_MAP_GPA_RANGE */
#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0
#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0)
#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1)
#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4)
#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1)
#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1

Some files were not shown because too many files have changed in this diff Show More