From c9b5ad546e7d486465a3dd8c89245ac3707a4384 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 14 Jun 2016 12:56:01 +0200 Subject: [PATCH 01/14] s390/mm: tag normal pages vs pages used in page tables The ESSA instruction has a new option that allows tagging pages that are not used as a page table. Without the tag the hypervisor has to assume that any guest page could be used in a page table inside the guest. This forces the hypervisor to flush all guest TLB entries whenever a host page table entry is invalidated. With the tag the host can skip the TLB flush if the page is tagged as a normal page. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/page-states.h | 1 + arch/s390/include/asm/page.h | 3 + arch/s390/include/asm/setup.h | 3 +- arch/s390/kernel/suspend.c | 22 +++- arch/s390/kernel/vdso.c | 2 + arch/s390/mm/init.c | 2 + arch/s390/mm/page-states.c | 196 +++++++++++++++++++++++++--- arch/s390/mm/pgalloc.c | 2 + 8 files changed, 209 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 42267a2fe29e..ca21b28a7b17 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -13,6 +13,7 @@ #define ESSA_SET_POT_VOLATILE 4 #define ESSA_SET_STABLE_RESIDENT 5 #define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 #define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 624deaa44230..b463df89f344 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -133,6 +133,9 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); +void arch_set_page_nodat(struct page *page, int order); +int arch_test_page_nodat(struct page *page); void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index cd78155b1829..11e59ba4b521 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -106,7 +106,8 @@ extern void pfault_fini(void); void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -extern void cmma_init(void); +void cmma_init(void); +void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index 39e2f41b6cf0..c8ea715bfe10 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -98,10 +98,16 @@ int page_key_alloc(unsigned long pages) */ void page_key_read(unsigned long *pfn) { + struct page *page; unsigned long addr; + unsigned char key; - addr = (unsigned long) page_address(pfn_to_page(*pfn)); - *(unsigned char *) pfn = (unsigned char) page_get_storage_key(addr); + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; } /* @@ -126,8 +132,16 @@ void page_key_memorize(unsigned long *pfn) */ void page_key_write(void *address) { - page_set_storage_key((unsigned long) address, - page_key_rp->data[page_key_rx], 0); + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0);
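+ /*
+  * The suspend image keeps one byte per pfn: bits 0-6 hold the storage
+  * key and bit 0x80 marks a page tagged stable/no-dat (see
+  * page_key_read() above), so both the key and the tag can be
+  * restored here on resume.
+  */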
+ page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); if (++page_key_rx >= PAGE_KEY_DATA_SIZE) return; page_key_rp = page_key_rp->next; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index b89d19f6f2ab..eacda05b45d7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -157,6 +157,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); /* Initialize per-cpu vdso data page */ vd = (struct vdso_per_cpu_data *) page_frame; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694ce55a..3aee54b2ba60 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -137,6 +137,8 @@ void __init mem_init(void) free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ + cmma_init_nodat(); + mem_init_print_info(NULL); } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 69a7b01ae746..07fa7b8ae233 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -10,9 +10,10 @@ #include #include #include +#include #include #include - +#include #include static int cmma_flag = 1; @@ -36,14 +37,16 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; + register int rc asm("1"); + /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); return rc; } @@ -51,11 +54,26 @@ void __init cmma_init(void) { if (!cmma_flag) return; - if (cmma_test_essa()) + if (cmma_test_essa()) { cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; } -static inline void set_page_unstable(struct page *page, int order) +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) { int i, rc; @@ -66,14 +84,7 @@ static inline void set_page_unstable(struct page *page, int order) "i" (ESSA_SET_UNUSED)); } -void arch_free_page(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_unstable(page, order); -} - -static inline void set_page_stable(struct page *page, int order) +static inline void set_page_stable_dat(struct page *page, int order) { int i, rc; @@ -84,11 +95,162 @@ static inline void set_page_stable(struct page *page, int order) "i" (ESSA_SET_STABLE)); } +static inline void set_page_stable_nodat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static 
void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); +} + void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); } void arch_set_page_states(int make_stable) @@ -108,9 +270,9 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable(page, order); + set_page_stable_dat(page, 0); else - set_page_unstable(page, order); + set_page_unused(page, order); } } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 18918e394ce4..a4de34ce392c 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -57,6 +57,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; + 
arch_set_page_dat(page, 2); return (unsigned long *) page_to_phys(page); } @@ -214,6 +215,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) __free_page(page); return NULL; } + arch_set_page_dat(page, 0); /* Initialize page table */ table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { From 118bd31bea2cdb7f1dbf22dd9a58e818b5313156 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 16:53:09 +0200 Subject: [PATCH 02/14] s390/mm: add no-dat TLB flush optimization Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 41 ++++++++---- arch/s390/include/asm/setup.h | 6 +- arch/s390/include/asm/tlbflush.h | 5 +- arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 111 ++++++++++++++++++++++++------- drivers/s390/char/sclp_early.c | 6 +- 6 files changed, 129 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57057fb1cc07..c92713893313 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -952,15 +952,27 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) +#define IPTE_NODAT 0x400 + +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + TLB flush for the pte */ + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" - : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), - [m4] "i" (local)); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } static inline void __ptep_ipte_range(unsigned long address, int nr, @@ -1341,31 +1353,36 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_GLOBAL 0 #define IDTE_LOCAL 1 -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 + +static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, int local) { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); + sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); asm volatile( " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), [m4] "i" (local) : "cc" ); } -static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) +static inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, int local) { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; asm volatile( " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), [m4] "i" (local) - : "cc"); + : "cc" ); } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 
11e59ba4b521..49c425903894 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,9 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_NX _BITUL(14) -#define MACHINE_FLAG_GS _BITUL(15) +#define MACHINE_FLAG_TLB_GUEST _BITUL(14) +#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_GS _BITUL(16) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,6 +69,7 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..38d82ed60345 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -20,10 +20,13 @@ static inline void __tlb_flush_local(void) */ static inline void __tlb_flush_idte(unsigned long asce) { + unsigned long opt; + + opt = IDTE_PTOA; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc"); + : : "a" (opt), "a" (asce) : "cc"); } #ifdef CONFIG_SMP diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 180481589246..5734b01ca765 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -328,7 +328,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index d4d409ba206b..9696bf89f03a 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -25,6 +25,38 @@ #include #include +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL) + opt |= IPTE_NODAT; + __ptep_ipte(addr, ptep, opt, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL) + opt |= IPTE_NODAT; + __ptep_ipte(addr, ptep, opt, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, IPTE_GLOBAL); + } +} + static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -36,9 +68,9 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte(addr, ptep, IPTE_LOCAL); + ptep_ipte_local(mm, addr, ptep); else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep); atomic_dec(&mm->context.flush_count); return old; } @@ -57,7 +89,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + 
ptep_ipte_global(mm, addr, ptep); atomic_dec(&mm->context.flush_count); return old; } @@ -290,6 +322,26 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_modify_prot_commit); +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, IDTE_LOCAL); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, pmdp, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); +} + static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { @@ -298,16 +350,12 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - __pmdp_csp(pmdp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte(addr, pmdp, IDTE_LOCAL); + pmdp_idte_local(mm, addr, pmdp); else - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + pmdp_idte_global(mm, addr, pmdp); atomic_dec(&mm->context.flush_count); return old; } @@ -325,10 +373,9 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); + } else { + pmdp_idte_global(mm, addr, pmdp); + } atomic_dec(&mm->context.flush_count); return old; } @@ -359,6 +406,30 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, IDTE_LOCAL); +} + +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, IDTE_GLOBAL); + else + /* + * Invalid bit position is the same for pmd and pud, so we can + * re-use _pmd_csp() here + */ + __pmdp_csp((pmd_t *) pudp); +} + static inline pud_t pudp_flush_direct(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { @@ -367,20 +438,12 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, old = *pudp; if (pud_val(old) & _REGION_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - /* - * Invalid bit position is the same for pmd and pud, so we can - * re-use _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pudp_idte(addr, pudp, IDTE_LOCAL); + pudp_idte_local(mm, addr, pudp); else - __pudp_idte(addr, pudp, IDTE_GLOBAL); + pudp_idte_global(mm, addr, pudp); atomic_dec(&mm->context.flush_count); return old; } @@ -645,7 +708,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep); if (MACHINE_HAS_ESOP 
|| !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index efd84d1d178b..bc1fc00910b0 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -39,7 +39,7 @@ struct read_info_sccb { u8 fac84; /* 84 */ u8 fac85; /* 85 */ u8 _pad_86[91 - 86]; /* 86-90 */ - u8 flags; /* 91 */ + u8 fac91; /* 91 */ u8 _pad_92[98 - 92]; /* 92-97 */ u8 fac98; /* 98 */ u8 hamaxpow; /* 99 */ @@ -103,6 +103,8 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_kss = !!(sccb->fac98 & 0x01); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; + if (sccb->fac91 & 0x40) + S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_GUEST; sclp.rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; sclp.rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; sclp.rzm <<= 20; @@ -139,7 +141,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) /* Save IPL information */ sclp_ipl_info.is_valid = 1; - if (sccb->flags & 0x2) + if (sccb->fac91 & 0x2) sclp_ipl_info.has_dump = 1; memcpy(&sclp_ipl_info.loadparm, &sccb->loadparm, LOADPARM_LEN); From 28c807e5132ecc9f1607461eabfa1fc67b21e163 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 16:00:22 +0200 Subject: [PATCH 03/14] s390/mm: add guest ASCE TLB flush optimization Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 58 +++++++++++++++++++++++--------- arch/s390/include/asm/tlbflush.h | 2 ++ arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 36 +++++++++++++------- 4 files changed, 70 insertions(+), 28 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c92713893313..b20b0f7170e8 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -953,9 +953,11 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_LOCAL 1 #define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 static inline void __ptep_ipte(unsigned long address, pte_t *ptep, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; @@ -969,6 +971,7 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep, } /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); asm volatile( " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (opt) @@ -1355,34 +1358,59 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_PTOA 0x0800 #define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), - [m4] "i" (local) - : "cc" ); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } static inline void 
__pudp_idte(unsigned long addr, pud_t *pudp, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), - [m4] "i" (local) - : "cc" ); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 38d82ed60345..4d759f8f4bc7 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -23,6 +23,8 @@ static inline void __tlb_flush_idte(unsigned long asce) unsigned long opt; opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 5734b01ca765..567fe92e2bb8 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -328,7 +328,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, 0, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9696bf89f03a..3f1abc7b5fd2 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -35,9 +35,13 @@ static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL) opt |= IPTE_NODAT; - __ptep_ipte(addr, ptep, opt, IPTE_LOCAL); + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); } else { - __ptep_ipte(addr, ptep, 0, IPTE_LOCAL); + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); } } @@ -51,9 +55,13 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL) opt |= IPTE_NODAT; - __ptep_ipte(addr, ptep, opt, IPTE_GLOBAL); + if (asce != -1UL) { + asce = asce ? 
: mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); } else { - __ptep_ipte(addr, ptep, 0, IPTE_GLOBAL); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); } } @@ -326,18 +334,20 @@ static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); else - __pmdp_idte(addr, pmdp, 0, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_GLOBAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, 0, IDTE_GLOBAL); + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); } @@ -410,18 +420,20 @@ static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { if (MACHINE_HAS_TLB_GUEST) - __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_LOCAL); + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); else - __pudp_idte(addr, pudp, 0, IDTE_LOCAL); + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); } static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { if (MACHINE_HAS_TLB_GUEST) - __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_GLOBAL); + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); else if (MACHINE_HAS_IDTE) - __pudp_idte(addr, pudp, 0, IDTE_GLOBAL); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else /* * Invalid bit position is the same for pmd and pud, so we can From cd774b9076845312caf76ad52523bd9219e89ae1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 17:02:31 +0200 Subject: [PATCH 04/14] s390/mm,kvm: use nodat PGSTE tag to optimize TLB flushing Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 1 + arch/s390/mm/pgtable.c | 47 +++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b20b0f7170e8..bb59a0aa3249 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -376,6 +376,7 @@ static inline int is_module_addr(void *addr) /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL #define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL #define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL #define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 3f1abc7b5fd2..8d018c76ee85 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -26,14 +26,14 @@ #include static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, int nodat) { unsigned long opt, asce; if (MACHINE_HAS_TLB_GUEST) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); - if (asce == 0UL) + if (asce == 0UL || nodat) opt |= IPTE_NODAT; if (asce != -1UL) { asce = asce ? 
: mm->context.asce; @@ -46,14 +46,14 @@ static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, } static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, int nodat) { unsigned long opt, asce; if (MACHINE_HAS_TLB_GUEST) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); - if (asce == 0UL) + if (asce == 0UL || nodat) opt |= IPTE_NODAT; if (asce != -1UL) { asce = asce ? : mm->context.asce; @@ -66,7 +66,8 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, } static inline pte_t ptep_flush_direct(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -76,15 +77,16 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - ptep_ipte_local(mm, addr, ptep); + ptep_ipte_local(mm, addr, ptep, nodat); else - ptep_ipte_global(mm, addr, ptep); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -97,7 +99,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - ptep_ipte_global(mm, addr, ptep); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } @@ -269,10 +271,12 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_direct(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -284,10 +288,12 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -299,10 +305,12 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); if (mm_has_pgste(mm)) { pgste = pgste_update_all(old, pgste, mm); pgste_set(ptep, pgste); @@ -557,7 +565,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, { pte_t entry; pgste_t pgste; - int pte_i, pte_p; + int pte_i, pte_p, nodat; pgste = pgste_get_lock(ptep); entry = *ptep; @@ -570,13 +578,14 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, return -EAGAIN; } /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); pte_val(entry) |= _PAGE_INVALID; } if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pte_val(entry) &= 
~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } @@ -616,10 +625,12 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { pgste_t pgste; + int nodat; pgste = pgste_get_lock(ptep); /* notifier is called by the caller */ - ptep_flush_direct(mm, saddr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); /* don't touch the storage key - it belongs to parent pgste */ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); pgste_set_unlock(ptep, pgste); @@ -692,6 +703,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte_t *ptep; pte_t pte; bool dirty; + int nodat; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); @@ -720,7 +732,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - ptep_ipte_global(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else From 4a4eefcd0e49f9f339933324c1bde431186a0a7d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 3 Aug 2017 13:05:11 +0200 Subject: [PATCH 05/14] KVM: s390: sthyi: fix sthyi inline assembly The sthyi inline assembly misses register r3 within the clobber list. The sthyi instruction will always write a return code to register "R2+1", which in this case would be r3. Due to that we may have register corruption and see host crashes or data corruption depending on how gcc decided to allocate and use registers during compile time. Fixes: 95ca2cb57985 ("KVM: s390: Add sthyi emulation") Cc: # 4.8+ Reviewed-by: Janosch Frank Signed-off-by: Heiko Carstens Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/sthyi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 926b5244263e..2773a2f6a5c4 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -394,7 +394,7 @@ static int sthyi(u64 vaddr) "srl %[cc],28\n" : [cc] "=d" (cc) : [code] "d" (code), [addr] "a" (addr) - : "memory", "cc"); + : "3", "memory", "cc"); return cc; } From 857b8de96795646c5891cf44ae6fb19b9ff74bf9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 3 Aug 2017 14:27:30 +0200 Subject: [PATCH 06/14] KVM: s390: sthyi: fix specification exception detection sthyi should only generate a specification exception if the function code is zero and the response buffer is not on a 4k boundary. The current code would also test for unknown function codes if the response buffer, that is currently only defined for function code 0, is not on a 4k boundary and incorrectly inject a specification exception instead of returning with condition code 3 and return code 4 (unsupported function code). Fix this by moving the boundary check. 
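In C-like pseudocode, the intended order of the checks looks as follows (a sketch with hypothetical helper names, not the actual handler code; the real change is in the diff below):

	if (reg1 == reg2 || reg1 & 1 || reg2 & 1)	/* register pair constraints */
		return inject_specification_exception();
	if (code & 0xffff)				/* unknown function code: cc 3, rc 4 */
		return set_cc3_rc4();
	if (addr & ~PAGE_MASK)				/* 4k alignment, only checked for fc 0 */
		return inject_specification_exception();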
Fixes: 95ca2cb57985 ("KVM: s390: Add sthyi emulation") Cc: # 4.8+ Reviewed-by: Janosch Frank Signed-off-by: Heiko Carstens Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/sthyi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 2773a2f6a5c4..a2e5c24f47a7 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -425,7 +425,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); trace_kvm_s390_handle_sthyi(vcpu, code, addr); - if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK) + if (reg1 == reg2 || reg1 & 1 || reg2 & 1) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); if (code & 0xffff) { @@ -433,6 +433,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } + if (addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* * If the page has not yet been faulted in, we want to do that * now and not after all the expensive calculations. From b697e435aeee99b7f5b2d8f8dbb51f791be99b16 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Fri, 24 Feb 2017 10:01:30 -0500 Subject: [PATCH 07/14] KVM: s390: Support Configuration z/Architecture Mode kvm has always supported the concept of starting in z/Arch mode so let's reflect the feature bit to the guest. Also, we change sigp set architecture to reject any request to change architecture modes. Signed-off-by: Jason J. Herne Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/sigp.c | 36 +++++++++++++++----------------- arch/s390/tools/gen_facilities.c | 1 + 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 1a252f537081..9d592ef4104b 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -155,29 +155,26 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, return rc; } -static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) +static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, + u64 *status_reg) { - int rc; unsigned int i; struct kvm_vcpu *v; + bool all_stopped = true; - switch (parameter & 0xff) { - case 0: - rc = SIGP_CC_NOT_OPERATIONAL; - break; - case 1: - case 2: - kvm_for_each_vcpu(i, v, vcpu->kvm) { - v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(v); - } - - rc = SIGP_CC_ORDER_CODE_ACCEPTED; - break; - default: - rc = -EOPNOTSUPP; + kvm_for_each_vcpu(i, v, vcpu->kvm) { + if (v == vcpu) + continue; + if (!is_vcpu_stopped(v)) + all_stopped = false; } - return rc; + + *status_reg &= 0xffffffff00000000UL; + + /* Reject set arch order, with czam we're always in z/Arch mode. */ + *status_reg |= (all_stopped ? 
SIGP_STATUS_INVALID_PARAMETER : + SIGP_STATUS_INCORRECT_STATE); + return SIGP_CC_STATUS_STORED; } static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, @@ -446,7 +443,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) switch (order_code) { case SIGP_SET_ARCHITECTURE: vcpu->stat.instruction_sigp_arch++; - rc = __sigp_set_arch(vcpu, parameter); + rc = __sigp_set_arch(vcpu, parameter, + &vcpu->run->s.regs.gprs[r1]); break; default: rc = handle_sigp_dst(vcpu, order_code, cpu_addr, diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 025ea20fc4b4..181db5b8f8b3 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ static struct facility_def facility_defs[] = { 78, /* enhanced-DAT 2 */ 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ + 138, /* configuration z/architecture mode (czam) */ 146, /* msa extension 8 */ -1 /* END */ } From 8fa1696ea78162ca3112a26879d9379483443c85 Mon Sep 17 00:00:00 2001 From: "Collin L. Walling" Date: Tue, 26 Jul 2016 15:29:44 -0400 Subject: [PATCH 08/14] KVM: s390: Multiple Epoch Facility support Allow for the enablement of MEF and the support for the extended epoch in SIE and VSIE for the extended guest TOD-Clock. A new interface is used for getting/setting a guest's extended TOD-Clock that uses a single ioctl invocation, KVM_S390_VM_TOD_EXT. Since the host time is a moving target that might see an epoch switch or STP sync checks, we need an atomic ioctl and cannot use the existing two interfaces. The old method of getting and setting the guest TOD-Clock is still retained and is used when the old ioctls are called. Signed-off-by: Collin L. Walling Reviewed-by: Janosch Frank Reviewed-by: Claudio Imbrenda Reviewed-by: Jason J. Herne Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/devices/vm.txt | 14 +++- arch/s390/include/asm/kvm_host.h | 6 +- arch/s390/include/uapi/asm/kvm.h | 6 ++ arch/s390/kvm/kvm-s390.c | 101 +++++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 2 + arch/s390/kvm/vsie.c | 10 +++ arch/s390/tools/gen_facilities.c | 1 + 7 files changed, 138 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index 903fc926860b..95ca68d663a4 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -176,7 +176,8 @@ Architectures: s390 3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH -Allows user space to set/get the TOD clock extension (u8). +Allows user space to set/get the TOD clock extension (u8) (superseded by +KVM_S390_VM_TOD_EXT). Parameters: address of a buffer in user space to store the data (u8) to Returns: -EFAULT if the given address is not accessible from kernel space @@ -190,6 +191,17 @@ the POP (u64). Parameters: address of a buffer in user space to store the data (u64) to Returns: -EFAULT if the given address is not accessible from kernel space +3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it +also allows user space to get/set it. If the guest CPU model does not support +it, it is stored as 0 and not allowed to be set to a value != 0.
+ +Parameters: address of a buffer in user space to store the data + (kvm_s390_vm_tod_clock) to +Returns: -EFAULT if the given address is not accessible from kernel space + -EINVAL if setting the TOD clock extension to != 0 is not supported + 4. GROUP: KVM_S390_VM_CRYPTO Architectures: s390 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index a409d5991934..51375e766e90 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -226,7 +226,9 @@ struct kvm_s390_sie_block { #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ - __u8 reserved68[4]; /* 0x0068 */ + __u8 reserved68; /* 0x0068 */ + __u8 epdx; /* 0x0069 */ + __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ __u8 reserved70[16]; /* 0x0070 */ __u64 mso; /* 0x0080 */ @@ -265,6 +267,7 @@ struct kvm_s390_sie_block { __u64 cbrlo; /* 0x01b8 */ __u8 reserved1c0[8]; /* 0x01c0 */ #define ECD_HOSTREGMGMT 0x20000000 +#define ECD_MEF 0x08000000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ @@ -739,6 +742,7 @@ struct kvm_arch{ struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; struct kvm_s390_vsie vsie; + u8 epdx; u64 epoch; struct kvm_s390_migration_state *migration_state; /* subset of available cpu features enabled by user space */ diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 69d09c39bbcd..cd7359e23d86 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -88,6 +88,12 @@ struct kvm_s390_io_adapter_req { /* kvm attributes for KVM_S390_VM_TOD */ #define KVM_S390_VM_TOD_LOW 0 #define KVM_S390_VM_TOD_HIGH 1 +#define KVM_S390_VM_TOD_EXT 2 + +struct kvm_s390_vm_tod_clock { + __u8 epoch_idx; + __u64 tod; +}; /* kvm attributes for KVM_S390_VM_CPU_MODEL */ /* processor related attributes are r/w */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index af09d3437631..e65b7637cc45 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -130,6 +130,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +struct kvm_s390_tod_clock_ext { + __u8 epoch_idx; + __u64 tod; + __u8 reserved[7]; +} __packed; + /* allow nested virtualization in KVM (if enabled by user space) */ static int nested; module_param(nested, int, S_IRUGO); @@ -874,6 +880,26 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_tod_clock gtod; + + if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod))) + return -EFAULT; + + if (test_kvm_facility(kvm, 139)) + kvm_s390_set_tod_clock_ext(kvm, &gtod); + else if (gtod.epoch_idx == 0) + kvm_s390_set_tod_clock(kvm, gtod.tod); + else + return -EINVAL; + + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", + gtod.epoch_idx, gtod.tod); + + return 0; +} + static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high; @@ -909,6 +935,9 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; switch (attr->attr) { + case KVM_S390_VM_TOD_EXT: + ret = kvm_s390_set_tod_ext(kvm, attr); + break; case KVM_S390_VM_TOD_HIGH: ret = kvm_s390_set_tod_high(kvm, attr); break; @@ -922,6 +951,43 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static void kvm_s390_get_tod_clock_ext(struct kvm *kvm, + struct kvm_s390_vm_tod_clock *gtod) +{ + struct
kvm_s390_tod_clock_ext htod; + + preempt_disable(); + + get_tod_clock_ext((char *)&htod); + + gtod->tod = htod.tod + kvm->arch.epoch; + gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx; + + if (gtod->tod < htod.tod) + gtod->epoch_idx += 1; + + preempt_enable(); +} + +static int kvm_s390_get_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_tod_clock gtod; + + memset(&gtod, 0, sizeof(gtod)); + + if (test_kvm_facility(kvm, 139)) + kvm_s390_get_tod_clock_ext(kvm, &gtod); + else + gtod.tod = kvm_s390_get_tod_clock_fast(kvm); + + if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod))) + return -EFAULT; + + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x, TOD base: 0x%llx", + gtod.epoch_idx, gtod.tod); + return 0; +} + static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) { u8 gtod_high = 0; @@ -954,6 +1020,9 @@ static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; switch (attr->attr) { + case KVM_S390_VM_TOD_EXT: + ret = kvm_s390_get_tod_ext(kvm, attr); + break; case KVM_S390_VM_TOD_HIGH: ret = kvm_s390_get_tod_high(kvm, attr); break; @@ -2369,6 +2438,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= ECA_VX; vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; } + if (test_kvm_facility(vcpu->kvm, 139)) + vcpu->arch.sie_block->ecd |= ECD_MEF; + vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) | SDNXC; vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; @@ -2855,6 +2927,35 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; } +void kvm_s390_set_tod_clock_ext(struct kvm *kvm, + const struct kvm_s390_vm_tod_clock *gtod) +{ + struct kvm_vcpu *vcpu; + struct kvm_s390_tod_clock_ext htod; + int i; + + mutex_lock(&kvm->lock); + preempt_disable(); + + get_tod_clock_ext((char *)&htod); + + kvm->arch.epoch = gtod->tod - htod.tod; + kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; + + if (kvm->arch.epoch > gtod->tod) + kvm->arch.epdx -= 1; + + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.sie_block->epoch = kvm->arch.epoch; + vcpu->arch.sie_block->epdx = kvm->arch.epdx; + } + + kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); + mutex_unlock(&kvm->lock); +} + void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) { struct kvm_vcpu *vcpu; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6fedc8bc7a37..9f8fdd7b2311 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -272,6 +272,8 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); int handle_sthyi(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ +void kvm_s390_set_tod_clock_ext(struct kvm *kvm, + const struct kvm_s390_vm_tod_clock *gtod); void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 715c19c45d9a..d471910bd9ea 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -349,6 +349,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->eca |= scb_o->eca & ECA_IB; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) scb_s->eca |= scb_o->eca & ECA_CEI; + /* Epoch Extension */ + if (test_kvm_facility(vcpu->kvm, 139)) + scb_s->ecd |= scb_o->ecd & ECD_MEF; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -919,6
+922,13 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu, */ preempt_disable(); scb_s->epoch += vcpu->kvm->arch.epoch; + + if (scb_s->ecd & ECD_MEF) { + scb_s->epdx += vcpu->kvm->arch.epdx; + if (scb_s->epoch < vcpu->kvm->arch.epoch) + scb_s->epdx += 1; + } + preempt_enable(); } diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 181db5b8f8b3..601bfcf99e2a 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -81,6 +81,7 @@ static struct facility_def facility_defs[] = { 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ 138, /* configuration z/architecture mode (czam) */ + 139, /* multiple epoch facility */ 146, /* msa extension 8 */ -1 /* END */ } From 631aebfee8261d66597ce0f88ba046c77a0e415e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 3 Aug 2017 13:05:40 +0200 Subject: [PATCH 09/14] KVM: s390: sthyi: remove invalid guest write access handle_sthyi() always writes to guest memory if the sthyi function code is zero in order to fault in the page that later is written to. However a function code of zero does not necessarily mean that a write to guest memory happens: if the KVM host is running as a second level guest under z/VM 6.2 the sthyi instruction is indicated to be available to the KVM host, however if the instruction is executed it will always return with a return code that indicates "unsupported function code". In such a case handle_sthyi() must not write to guest memory. This means that the prior write access to fault in the guest page may result in invalid guest exceptions, and/or invalid data modification. In order to be architecture compliant simply remove the write_guest() call. Given that the guest assumed a write access anyway, this fix does not qualify for -stable. This just makes sure the sthyi handler is architecture compliant. Fixes: 95ca2cb57985 ("KVM: s390: Add sthyi emulation") Reviewed-by: Janosch Frank Signed-off-by: Heiko Carstens Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/sthyi.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index a2e5c24f47a7..395926b8c1ed 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -436,14 +436,6 @@ int handle_sthyi(struct kvm_vcpu *vcpu) if (addr & ~PAGE_MASK) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* - * If the page has not yet been faulted in, we want to do that - * now and not after all the expensive calculations. - */ - r = write_guest(vcpu, addr, reg2, &cc, 1); - if (r) - return kvm_s390_inject_prog_cond(vcpu, r); - sctns = (void *)get_zeroed_page(GFP_KERNEL); if (!sctns) return -ENOMEM; From 1bab1c02aff73eb5c3001e97f48e64ab6a80988a Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 29 Aug 2016 15:56:55 +0200 Subject: [PATCH 10/14] KVM: s390: expose no-DAT to guest and migration support The STFLE bit 147 indicates whether the ESSA no-DAT operation code is valid, the bit is not normally provided to the host; the host is instead provided with an SCLP bit that indicates whether guests can support the feature. This patch: * enables the STFLE bit in the guest if the corresponding SCLP bit is present in the host. * adds support for migrating the no-DAT bit in the PGSTEs * fixes the software interpretation of the ESSA instruction that is used when migrating, both for the new operation code and for the old "set stable", as per specifications. 
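For illustration, the per-page value returned to user space by the CMMA migration interface now carries the no-DAT state next to the usage state; a sketch of the encoding, using the PGSTE mask from the kernel headers (the actual change is in the kvm-s390.c hunk below):

	/* _PGSTE_GPS_NODAT is 0x0000000040000000UL, i.e. bit 0x40 after the shift */
	res[i++] = (pgstev >> 24) & 0x43;	/* usage state (0x03) | no-DAT (0x40) */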
Signed-off-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/page-states.h | 2 +- arch/s390/kvm/kvm-s390.c | 8 ++++++-- arch/s390/kvm/priv.c | 6 +++++- arch/s390/mm/pgtable.c | 6 +++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index ca21b28a7b17..22b0f49e87c1 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -15,6 +15,6 @@ #define ESSA_SET_STABLE_IF_RESIDENT 6 #define ESSA_SET_STABLE_NODAT 7 -#define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT +#define ESSA_MAX ESSA_SET_STABLE_NODAT #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e65b7637cc45..84c069afc02f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1574,7 +1574,7 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (r < 0) pgstev = 0; /* save the value */ - res[i++] = (pgstev >> 24) & 0x3; + res[i++] = (pgstev >> 24) & 0x43; /* * if the next bit is too far away, stop. * if we reached the previous "next", find the next one @@ -1652,7 +1652,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, pgstev = bits[i]; pgstev = pgstev << 24; - mask &= _PGSTE_GPS_USAGE_MASK; + mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; set_pgste_bits(kvm->mm, hva, mask, pgstev); } srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -1929,6 +1929,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_kvm_facility(kvm->arch.model.fac_mask, 74); set_kvm_facility(kvm->arch.model.fac_list, 74); + if (MACHINE_HAS_TLB_GUEST) { + set_kvm_facility(kvm->arch.model.fac_mask, 147); + set_kvm_facility(kvm->arch.model.fac_list, 147); + } kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8a1dac793d6b..91dc4a87ad61 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -988,6 +988,8 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) if (pgstev & _PGSTE_GPS_ZERO) res |= 1; } + if (pgstev & _PGSTE_GPS_NODAT) + res |= 0x20; vcpu->run->s.regs.gprs[r1] = res; /* * It is possible that all the normal 511 slots were full, in which case @@ -1027,7 +1029,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); /* Check for invalid operation request code */ orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; - if (orc > ESSA_MAX) + /* ORCs 0-6 are always valid */ + if (orc > (test_kvm_facility(vcpu->kvm, 147) ? 
ESSA_SET_STABLE_NODAT + : ESSA_SET_STABLE_IF_RESIDENT)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); if (likely(!vcpu->kvm->arch.migration_state)) { diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 8d018c76ee85..459716de5318 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -919,7 +919,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, case ESSA_GET_STATE: break; case ESSA_SET_STABLE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgstev |= _PGSTE_GPS_USAGE_STABLE; break; case ESSA_SET_UNUSED: @@ -965,6 +965,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, pgstev |= _PGSTE_GPS_USAGE_STABLE; } break; + case ESSA_SET_STABLE_NODAT: + pgstev &= ~_PGSTE_GPS_USAGE_MASK; + pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; + break; default: /* we should never get here! */ break; From 1935222dc2f22ea885e7a7ece6d1be586aafe4a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 29 Aug 2017 16:31:08 +0200 Subject: [PATCH 11/14] KVM: s390: we are always in czam mode Independent of the underlying hardware, kvm will now always handle SIGP SET ARCHITECTURE as if czam were enabled. Therefore, let's not only forward that bit but always set it. While at it, add a comment regarding STHYI. Signed-off-by: David Hildenbrand Message-Id: <20170829143108.14703-1-david@redhat.com> Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/s390/tools/gen_facilities.c | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 84c069afc02f..39115f5a38df 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1927,6 +1927,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); + /* we are always in czam mode - even on pre z14 machines */ + set_kvm_facility(kvm->arch.model.fac_mask, 138); + set_kvm_facility(kvm->arch.model.fac_list, 138); + /* we emulate STHYI in kvm */ set_kvm_facility(kvm->arch.model.fac_mask, 74); set_kvm_facility(kvm->arch.model.fac_list, 74); if (MACHINE_HAS_TLB_GUEST) { diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 601bfcf99e2a..19f12cef5bdf 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,7 +80,6 @@ static struct facility_def facility_defs[] = { 78, /* enhanced-DAT 2 */ 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ - 138, /* configuration z/architecture mode (czam) */ 139, /* multiple epoch facility */ 146, /* msa extension 8 */ -1 /* END */ From 8149fc07724a1eee65d772749935bd923ae4f686 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Aug 2017 18:06:00 +0200 Subject: [PATCH 12/14] KVM: s390: guestdbg: fix range check Looks like the "overflowing" range check is wrong. 
|=======b-------a=======| addr >= a || addr <= b Signed-off-by: David Hildenbrand Message-Id: <20170830160603.5452-2-david@redhat.com> Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/guestdbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index c2e0ddc1356e..bcbd86621d01 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -308,7 +308,7 @@ static inline int in_addr_range(u64 addr, u64 a, u64 b) return (addr >= a) && (addr <= b); else /* "overflowing" interval */ - return (addr <= a) && (addr >= b); + return (addr >= a) || (addr <= b); } #define end_of_range(bp_info) (bp_info->addr + bp_info->len - 1) From 3dbf0205b18c9db9ee86d886c94e731cc1fa7f09 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Aug 2017 18:06:01 +0200 Subject: [PATCH 13/14] KVM: s390: use WARN_ON_ONCE only for checking Move the real logic that always has to be executed out of the WARN_ON_ONCE. Signed-off-by: David Hildenbrand Message-Id: <20170830160603.5452-3-david@redhat.com> Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a619ddae610d..a832ad031cee 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2479,6 +2479,7 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, struct kvm_s390_mchk_info *mchk; union mci mci; __u64 cr14 = 0; /* upper bits are not used */ + int rc; mci.val = mcck_info->mcic; if (mci.sr) @@ -2496,12 +2497,13 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, if (mci.ck) { /* Inject the floating machine check */ inti.type = KVM_S390_MCHK; - WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti)); + rc = __inject_vm(vcpu->kvm, &inti); } else { /* Inject the machine check to specified vcpu */ irq.type = KVM_S390_MCHK; - WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq)); + rc = kvm_s390_inject_vcpu(vcpu, &irq); } + WARN_ON_ONCE(rc); } int kvm_set_routing_entry(struct kvm *kvm, From c95c895303ed13e883e0e6386e0b978174329210 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 30 Aug 2017 18:06:02 +0200 Subject: [PATCH 14/14] KVM: s390: vsie: cleanup mcck reinjection The machine check information is part of the vsie_page. Signed-off-by: David Hildenbrand Message-Id: <20170830160603.5452-4-david@redhat.com> Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d471910bd9ea..fbe46dd0e55d 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -809,8 +809,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; - struct mcck_volatile_info *mcck_info; - struct sie_page *sie_page; int rc; handle_last_fault(vcpu, vsie_page); @@ -834,9 +832,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (rc == -EINTR) { VCPU_EVENT(vcpu, 3, "%s", "machine check"); - sie_page = container_of(scb_s, struct sie_page, sie_block); - mcck_info = &sie_page->mcck_info; - kvm_s390_reinject_machine_check(vcpu, mcck_info); + kvm_s390_reinject_machine_check(vcpu, &vsie_page->mcck_info); return 0; }
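To make the corrected interval logic from the guestdbg fix (PATCH 12/14 above) easy to check, here is a small self-contained C sketch; in_addr_range() mirrors the fixed kernel helper, while the test harness around it is a hypothetical addition, not kernel code:

	#include <stdio.h>

	/* For a normal interval a <= b the address must lie within [a, b];
	 * for an "overflowing" interval (b < a, the range wraps around the
	 * end of the address space) it must lie in [a, MAX] or [0, b],
	 * i.e. addr >= a || addr <= b. */
	static int in_addr_range(unsigned long long addr,
				 unsigned long long a, unsigned long long b)
	{
		if (a <= b)
			return addr >= a && addr <= b;
		return addr >= a || addr <= b;	/* was: addr <= a && addr >= b */
	}

	int main(void)
	{
		/* wrapping range [0xfff0, 0x000f] */
		printf("%d\n", in_addr_range(0xfff8, 0xfff0, 0x000f));	/* 1 */
		printf("%d\n", in_addr_range(0x0008, 0xfff0, 0x000f));	/* 1 */
		printf("%d\n", in_addr_range(0x8000, 0xfff0, 0x000f));	/* 0 */
		return 0;
	}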