Merge branch 'for-next/kexec' into for-next/core

Significant steps along the road to leaving the MMU enabled during kexec
relocation.

* for-next/kexec:
  arm64: hibernate: add __force attribute to gfp_t casting
  arm64: kexec: arm64_relocate_new_kernel don't use x0 as temp
  arm64: kexec: arm64_relocate_new_kernel clean-ups and optimizations
  arm64: kexec: call kexec_image_info only once
  arm64: kexec: move relocation function setup
  arm64: trans_pgd: hibernate: idmap the single page that holds the copy page routines
  arm64: mm: Always update TCR_EL1 from __cpu_set_tcr_t0sz()
  arm64: trans_pgd: pass NULL instead of init_mm to *_populate functions
  arm64: trans_pgd: pass allocator trans_pgd_create_copy
  arm64: trans_pgd: make trans_pgd_map_page generic
  arm64: hibernate: move page handling function to new trans_pgd.c
  arm64: hibernate: variable pudp is used instead of pd4dp
  arm64: kexec: make dtb_mem always enabled
This commit is contained in:
Will Deacon 2021-02-12 15:03:53 +00:00
commit b374d0f981
9 changed files with 434 additions and 322 deletions

View file

@ -1132,6 +1132,10 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
config TRANS_TABLE
def_bool y
depends on HIBERNATION
config XEN_DOM0
def_bool y
depends on XEN

View file

@ -90,18 +90,19 @@ static inline void crash_prepare_suspend(void) {}
static inline void crash_post_resume(void) {}
#endif
#ifdef CONFIG_KEXEC_FILE
#define ARCH_HAS_KIMAGE_ARCH
struct kimage_arch {
void *dtb;
unsigned long dtb_mem;
phys_addr_t dtb_mem;
phys_addr_t kern_reloc;
/* Core ELF header buffer */
void *elf_headers;
unsigned long elf_headers_mem;
unsigned long elf_headers_sz;
};
#ifdef CONFIG_KEXEC_FILE
extern const struct kexec_file_ops kexec_image_ops;
struct kimage;

View file

@ -81,16 +81,15 @@ static inline bool __cpu_uses_extended_idmap_level(void)
}
/*
* Set TCR.T0SZ to its default value (based on VA_BITS)
* Ensure TCR.T0SZ is set to the provided value.
*/
static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
{
unsigned long tcr;
unsigned long tcr = read_sysreg(tcr_el1);
if (!__cpu_uses_extended_idmap())
if ((tcr & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET == t0sz)
return;
tcr = read_sysreg(tcr_el1);
tcr &= ~TCR_T0SZ_MASK;
tcr |= t0sz << TCR_T0SZ_OFFSET;
write_sysreg(tcr, tcr_el1);

View file

@ -0,0 +1,39 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2020, Microsoft Corporation.
* Pavel Tatashin <pasha.tatashin@soleen.com>
*/
#ifndef _ASM_TRANS_TABLE_H
#define _ASM_TRANS_TABLE_H
#include <linux/bits.h>
#include <linux/types.h>
#include <asm/pgtable-types.h>
/*
* trans_alloc_page
* - Allocator that should return exactly one zeroed page, if this
* allocator fails, trans_pgd_create_copy() and trans_pgd_map_page()
* return -ENOMEM error.
*
* trans_alloc_arg
* - Passed to trans_alloc_page as an argument
*/
struct trans_pgd_info {
void * (*trans_alloc_page)(void *arg);
void *trans_alloc_arg;
};
int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd,
unsigned long start, unsigned long end);
int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
void *page, unsigned long dst_addr, pgprot_t pgprot);
int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
unsigned long *t0sz, void *page);
#endif /* _ASM_TRANS_TABLE_H */

View file

@ -16,7 +16,6 @@
#define pr_fmt(x) "hibernate: " x
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/sched.h>
#include <linux/suspend.h>
@ -31,13 +30,12 @@
#include <asm/memory.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
#include <asm/pgalloc.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
#include <asm/sysreg.h>
#include <asm/trans_pgd.h>
#include <asm/virt.h>
/*
@ -178,52 +176,9 @@ int arch_hibernation_header_restore(void *addr)
}
EXPORT_SYMBOL(arch_hibernation_header_restore);
static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
unsigned long dst_addr,
pgprot_t pgprot)
static void *hibernate_page_alloc(void *arg)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
if (pgd_none(READ_ONCE(*pgdp))) {
pudp = (void *)get_safe_page(GFP_ATOMIC);
if (!pudp)
return -ENOMEM;
pgd_populate(&init_mm, pgdp, pudp);
}
p4dp = p4d_offset(pgdp, dst_addr);
if (p4d_none(READ_ONCE(*p4dp))) {
pudp = (void *)get_safe_page(GFP_ATOMIC);
if (!pudp)
return -ENOMEM;
p4d_populate(&init_mm, p4dp, pudp);
}
pudp = pud_offset(p4dp, dst_addr);
if (pud_none(READ_ONCE(*pudp))) {
pmdp = (void *)get_safe_page(GFP_ATOMIC);
if (!pmdp)
return -ENOMEM;
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, dst_addr);
if (pmd_none(READ_ONCE(*pmdp))) {
ptep = (void *)get_safe_page(GFP_ATOMIC);
if (!ptep)
return -ENOMEM;
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, dst_addr);
set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC));
return 0;
return (void *)get_safe_page((__force gfp_t)(unsigned long)arg);
}
/*
@ -239,11 +194,16 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
* page system.
*/
static int create_safe_exec_page(void *src_start, size_t length,
unsigned long dst_addr,
phys_addr_t *phys_dst_addr)
{
struct trans_pgd_info trans_info = {
.trans_alloc_page = hibernate_page_alloc,
.trans_alloc_arg = (__force void *)GFP_ATOMIC,
};
void *page = (void *)get_safe_page(GFP_ATOMIC);
pgd_t *trans_pgd;
phys_addr_t trans_ttbr0;
unsigned long t0sz;
int rc;
if (!page)
@ -251,13 +211,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
memcpy(page, src_start, length);
__flush_icache_range((unsigned long)page, (unsigned long)page + length);
trans_pgd = (void *)get_safe_page(GFP_ATOMIC);
if (!trans_pgd)
return -ENOMEM;
rc = trans_pgd_map_page(trans_pgd, page, dst_addr,
PAGE_KERNEL_EXEC);
rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page);
if (rc)
return rc;
@ -270,12 +224,15 @@ static int create_safe_exec_page(void *src_start, size_t length,
* page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI
* runtime services), while for a userspace-driven test_resume cycle it
* points to userspace page tables (and we must point it at a zero page
* ourselves). Elsewhere we only (un)install the idmap with preemption
* disabled, so T0SZ should be as required regardless.
* ourselves).
*
* We change T0SZ as part of installing the idmap. This is undone by
* cpu_uninstall_idmap() in __cpu_suspend_exit().
*/
cpu_set_reserved_ttbr0();
local_flush_tlb_all();
write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1);
__cpu_set_tcr_t0sz(t0sz);
write_sysreg(trans_ttbr0, ttbr0_el1);
isb();
*phys_dst_addr = virt_to_phys(page);
@ -462,182 +419,6 @@ int swsusp_arch_suspend(void)
return ret;
}
static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
pte_t pte = READ_ONCE(*src_ptep);
if (pte_valid(pte)) {
/*
* Resume will overwrite areas that may be marked
* read only (code, rodata). Clear the RDONLY bit from
* the temporary mappings we use during restore.
*/
set_pte(dst_ptep, pte_mkwrite(pte));
} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
/*
* debug_pagealloc will removed the PTE_VALID bit if
* the page isn't in use by the resume kernel. It may have
* been in use by the original kernel, in which case we need
* to put it back in our copy to do the restore.
*
* Before marking this entry valid, check the pfn should
* be mapped.
*/
BUG_ON(!pfn_valid(pte_pfn(pte)));
set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
}
}
static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start,
unsigned long end)
{
pte_t *src_ptep;
pte_t *dst_ptep;
unsigned long addr = start;
dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC);
if (!dst_ptep)
return -ENOMEM;
pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep);
dst_ptep = pte_offset_kernel(dst_pmdp, start);
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
_copy_pte(dst_ptep, src_ptep, addr);
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
}
static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start,
unsigned long end)
{
pmd_t *src_pmdp;
pmd_t *dst_pmdp;
unsigned long next;
unsigned long addr = start;
if (pud_none(READ_ONCE(*dst_pudp))) {
dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC);
if (!dst_pmdp)
return -ENOMEM;
pud_populate(&init_mm, dst_pudp, dst_pmdp);
}
dst_pmdp = pmd_offset(dst_pudp, start);
src_pmdp = pmd_offset(src_pudp, start);
do {
pmd_t pmd = READ_ONCE(*src_pmdp);
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
continue;
if (pmd_table(pmd)) {
if (copy_pte(dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
set_pmd(dst_pmdp,
__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
return 0;
}
static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start,
unsigned long end)
{
pud_t *dst_pudp;
pud_t *src_pudp;
unsigned long next;
unsigned long addr = start;
if (p4d_none(READ_ONCE(*dst_p4dp))) {
dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!dst_pudp)
return -ENOMEM;
p4d_populate(&init_mm, dst_p4dp, dst_pudp);
}
dst_pudp = pud_offset(dst_p4dp, start);
src_pudp = pud_offset(src_p4dp, start);
do {
pud_t pud = READ_ONCE(*src_pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
continue;
if (pud_table(pud)) {
if (copy_pmd(dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
set_pud(dst_pudp,
__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);
return 0;
}
static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
unsigned long end)
{
p4d_t *dst_p4dp;
p4d_t *src_p4dp;
unsigned long next;
unsigned long addr = start;
dst_p4dp = p4d_offset(dst_pgdp, start);
src_p4dp = p4d_offset(src_pgdp, start);
do {
next = p4d_addr_end(addr, end);
if (p4d_none(READ_ONCE(*src_p4dp)))
continue;
if (copy_pud(dst_p4dp, src_p4dp, addr, next))
return -ENOMEM;
} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
return 0;
}
static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
unsigned long end)
{
unsigned long next;
unsigned long addr = start;
pgd_t *src_pgdp = pgd_offset_k(start);
dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
do {
next = pgd_addr_end(addr, end);
if (pgd_none(READ_ONCE(*src_pgdp)))
continue;
if (copy_p4d(dst_pgdp, src_pgdp, addr, next))
return -ENOMEM;
} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
return 0;
}
static int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start,
unsigned long end)
{
int rc;
pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC);
if (!trans_pgd) {
pr_err("Failed to allocate memory for temporary page tables.\n");
return -ENOMEM;
}
rc = copy_page_tables(trans_pgd, start, end);
if (!rc)
*dst_pgdp = trans_pgd;
return rc;
}
/*
* Setup then Resume from the hibernate image using swsusp_arch_suspend_exit().
*
@ -650,16 +431,20 @@ int swsusp_arch_resume(void)
void *zero_page;
size_t exit_size;
pgd_t *tmp_pg_dir;
phys_addr_t phys_hibernate_exit;
void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
void *, phys_addr_t, phys_addr_t);
struct trans_pgd_info trans_info = {
.trans_alloc_page = hibernate_page_alloc,
.trans_alloc_arg = (void *)GFP_ATOMIC,
};
/*
* Restoring the memory image will overwrite the ttbr1 page tables.
* Create a second copy of just the linear map, and use this when
* restoring.
*/
rc = trans_pgd_create_copy(&tmp_pg_dir, PAGE_OFFSET, PAGE_END);
rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET,
PAGE_END);
if (rc)
return rc;
@ -673,19 +458,13 @@ int swsusp_arch_resume(void)
return -ENOMEM;
}
/*
* Locate the exit code in the bottom-but-one page, so that *NULL
* still has disastrous affects.
*/
hibernate_exit = (void *)PAGE_SIZE;
exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start;
/*
* Copy swsusp_arch_suspend_exit() to a safe page. This will generate
* a new set of ttbr0 page tables and load them.
*/
rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
(unsigned long)hibernate_exit,
&phys_hibernate_exit);
(phys_addr_t *)&hibernate_exit);
if (rc) {
pr_err("Failed to create safe executable page for hibernate_exit code.\n");
return rc;
@ -704,7 +483,7 @@ int swsusp_arch_resume(void)
* We can skip this step if we booted at EL1, or are running with VHE.
*/
if (el2_reset_needed()) {
phys_addr_t el2_vectors = phys_hibernate_exit; /* base */
phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit;
el2_vectors += hibernate_el2_vectors -
__hibernate_exit_text_start; /* offset */

View file

@ -42,6 +42,7 @@ static void _kexec_image_info(const char *func, int line,
pr_debug(" start: %lx\n", kimage->start);
pr_debug(" head: %lx\n", kimage->head);
pr_debug(" nr_segments: %lu\n", kimage->nr_segments);
pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc);
for (i = 0; i < kimage->nr_segments; i++) {
pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n",
@ -58,6 +59,23 @@ void machine_kexec_cleanup(struct kimage *kimage)
/* Empty routine needed to avoid build errors. */
}
int machine_kexec_post_load(struct kimage *kimage)
{
void *reloc_code = page_to_virt(kimage->control_code_page);
memcpy(reloc_code, arm64_relocate_new_kernel,
arm64_relocate_new_kernel_size);
kimage->arch.kern_reloc = __pa(reloc_code);
kexec_image_info(kimage);
/* Flush the reloc_code in preparation for its execution. */
__flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size);
flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code +
arm64_relocate_new_kernel_size);
return 0;
}
/**
* machine_kexec_prepare - Prepare for a kexec reboot.
*
@ -67,8 +85,6 @@ void machine_kexec_cleanup(struct kimage *kimage)
*/
int machine_kexec_prepare(struct kimage *kimage)
{
kexec_image_info(kimage);
if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) {
pr_err("Can't kexec: CPUs are stuck in the kernel.\n");
return -EBUSY;
@ -143,8 +159,6 @@ static void kexec_segment_flush(const struct kimage *kimage)
*/
void machine_kexec(struct kimage *kimage)
{
phys_addr_t reboot_code_buffer_phys;
void *reboot_code_buffer;
bool in_kexec_crash = (kimage == kexec_crash_image);
bool stuck_cpus = cpus_are_stuck_in_kernel();
@ -155,31 +169,6 @@ void machine_kexec(struct kimage *kimage)
WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()),
"Some CPUs may be stale, kdump will be unreliable.\n");
reboot_code_buffer_phys = page_to_phys(kimage->control_code_page);
reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
kexec_image_info(kimage);
/*
* Copy arm64_relocate_new_kernel to the reboot_code_buffer for use
* after the kernel is shut down.
*/
memcpy(reboot_code_buffer, arm64_relocate_new_kernel,
arm64_relocate_new_kernel_size);
/* Flush the reboot_code_buffer in preparation for its execution. */
__flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size);
/*
* Although we've killed off the secondary CPUs, we don't update
* the online mask if we're handling a crash kernel and consequently
* need to avoid flush_icache_range(), which will attempt to IPI
* the offline CPUs. Therefore, we must use the __* variant here.
*/
__flush_icache_range((uintptr_t)reboot_code_buffer,
(uintptr_t)reboot_code_buffer +
arm64_relocate_new_kernel_size);
/* Flush the kimage list and its buffers. */
kexec_list_flush(kimage);
@ -193,7 +182,7 @@ void machine_kexec(struct kimage *kimage)
/*
* cpu_soft_restart will shutdown the MMU, disable data caches, then
* transfer control to the reboot_code_buffer which contains a copy of
* transfer control to the kern_reloc which contains a copy of
* the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel
* uses physical addressing to relocate the new image to its final
* position and transfers control to the image entry point when the
@ -203,12 +192,8 @@ void machine_kexec(struct kimage *kimage)
* userspace (kexec-tools).
* In kexec_file case, the kernel starts directly without purgatory.
*/
cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start,
#ifdef CONFIG_KEXEC_FILE
kimage->arch.dtb_mem);
#else
0);
#endif
cpu_soft_restart(kimage->arch.kern_reloc, kimage->head, kimage->start,
kimage->arch.dtb_mem);
BUG(); /* Should never get here. */
}

View file

@ -17,28 +17,24 @@
/*
* arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it.
*
* The memory that the old kernel occupies may be overwritten when coping the
* The memory that the old kernel occupies may be overwritten when copying the
* new image to its final location. To assure that the
* arm64_relocate_new_kernel routine which does that copy is not overwritten,
* all code and data needed by arm64_relocate_new_kernel must be between the
* symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The
* machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec
* control_code_page, a special page which has been set up to be preserved
* during the copy operation.
* safe memory that has been set up to be preserved during the copy operation.
*/
SYM_CODE_START(arm64_relocate_new_kernel)
/* Setup the list loop variables. */
mov x18, x2 /* x18 = dtb address */
mov x17, x1 /* x17 = kimage_start */
mov x16, x0 /* x16 = kimage_head */
raw_dcache_line_size x15, x0 /* x15 = dcache line size */
mov x14, xzr /* x14 = entry ptr */
mov x13, xzr /* x13 = copy dest */
/* Check if the new image needs relocation. */
tbnz x16, IND_DONE_BIT, .Ldone
raw_dcache_line_size x15, x1 /* x15 = dcache line size */
.Lloop:
and x12, x16, PAGE_MASK /* x12 = addr */
@ -47,44 +43,28 @@ SYM_CODE_START(arm64_relocate_new_kernel)
tbz x16, IND_SOURCE_BIT, .Ltest_indirection
/* Invalidate dest page to PoC. */
mov x0, x13
add x20, x0, #PAGE_SIZE
mov x2, x13
add x20, x2, #PAGE_SIZE
sub x1, x15, #1
bic x0, x0, x1
2: dc ivac, x0
add x0, x0, x15
cmp x0, x20
bic x2, x2, x1
2: dc ivac, x2
add x2, x2, x15
cmp x2, x20
b.lo 2b
dsb sy
mov x20, x13
mov x21, x12
copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7
/* dest += PAGE_SIZE */
add x13, x13, PAGE_SIZE
copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8
b .Lnext
.Ltest_indirection:
tbz x16, IND_INDIRECTION_BIT, .Ltest_destination
/* ptr = addr */
mov x14, x12
mov x14, x12 /* ptr = addr */
b .Lnext
.Ltest_destination:
tbz x16, IND_DESTINATION_BIT, .Lnext
/* dest = addr */
mov x13, x12
mov x13, x12 /* dest = addr */
.Lnext:
/* entry = *ptr++ */
ldr x16, [x14], #8
/* while (!(entry & DONE)) */
tbz x16, IND_DONE_BIT, .Lloop
ldr x16, [x14], #8 /* entry = *ptr++ */
tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */
.Ldone:
/* wait for writes from copy_page to finish */
dsb nsh

View file

@ -6,6 +6,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ARM64_MTE) += mteswap.o

324
arch/arm64/mm/trans_pgd.c Normal file
View file

@ -0,0 +1,324 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Transitional page tables for kexec and hibernate
*
* This file derived from: arch/arm64/kernel/hibernate.c
*
* Copyright (c) 2020, Microsoft Corporation.
* Pavel Tatashin <pasha.tatashin@soleen.com>
*
*/
/*
* Transitional tables are used during system transferring from one world to
* another: such as during hibernate restore, and kexec reboots. During these
* phases one cannot rely on page table not being overwritten. This is because
* hibernate and kexec can overwrite the current page tables during transition.
*/
#include <asm/trans_pgd.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <linux/suspend.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
static void *trans_alloc(struct trans_pgd_info *info)
{
return info->trans_alloc_page(info->trans_alloc_arg);
}
static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
pte_t pte = READ_ONCE(*src_ptep);
if (pte_valid(pte)) {
/*
* Resume will overwrite areas that may be marked
* read only (code, rodata). Clear the RDONLY bit from
* the temporary mappings we use during restore.
*/
set_pte(dst_ptep, pte_mkwrite(pte));
} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
/*
* debug_pagealloc will removed the PTE_VALID bit if
* the page isn't in use by the resume kernel. It may have
* been in use by the original kernel, in which case we need
* to put it back in our copy to do the restore.
*
* Before marking this entry valid, check the pfn should
* be mapped.
*/
BUG_ON(!pfn_valid(pte_pfn(pte)));
set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
}
}
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
pte_t *src_ptep;
pte_t *dst_ptep;
unsigned long addr = start;
dst_ptep = trans_alloc(info);
if (!dst_ptep)
return -ENOMEM;
pmd_populate_kernel(NULL, dst_pmdp, dst_ptep);
dst_ptep = pte_offset_kernel(dst_pmdp, start);
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
_copy_pte(dst_ptep, src_ptep, addr);
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
}
static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
pud_t *src_pudp, unsigned long start, unsigned long end)
{
pmd_t *src_pmdp;
pmd_t *dst_pmdp;
unsigned long next;
unsigned long addr = start;
if (pud_none(READ_ONCE(*dst_pudp))) {
dst_pmdp = trans_alloc(info);
if (!dst_pmdp)
return -ENOMEM;
pud_populate(NULL, dst_pudp, dst_pmdp);
}
dst_pmdp = pmd_offset(dst_pudp, start);
src_pmdp = pmd_offset(src_pudp, start);
do {
pmd_t pmd = READ_ONCE(*src_pmdp);
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
continue;
if (pmd_table(pmd)) {
if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
set_pmd(dst_pmdp,
__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
return 0;
}
static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
p4d_t *src_p4dp, unsigned long start,
unsigned long end)
{
pud_t *dst_pudp;
pud_t *src_pudp;
unsigned long next;
unsigned long addr = start;
if (p4d_none(READ_ONCE(*dst_p4dp))) {
dst_pudp = trans_alloc(info);
if (!dst_pudp)
return -ENOMEM;
p4d_populate(NULL, dst_p4dp, dst_pudp);
}
dst_pudp = pud_offset(dst_p4dp, start);
src_pudp = pud_offset(src_p4dp, start);
do {
pud_t pud = READ_ONCE(*src_pudp);
next = pud_addr_end(addr, end);
if (pud_none(pud))
continue;
if (pud_table(pud)) {
if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
set_pud(dst_pudp,
__pud(pud_val(pud) & ~PUD_SECT_RDONLY));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);
return 0;
}
static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp,
pgd_t *src_pgdp, unsigned long start,
unsigned long end)
{
p4d_t *dst_p4dp;
p4d_t *src_p4dp;
unsigned long next;
unsigned long addr = start;
dst_p4dp = p4d_offset(dst_pgdp, start);
src_p4dp = p4d_offset(src_pgdp, start);
do {
next = p4d_addr_end(addr, end);
if (p4d_none(READ_ONCE(*src_p4dp)))
continue;
if (copy_pud(info, dst_p4dp, src_p4dp, addr, next))
return -ENOMEM;
} while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
return 0;
}
static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp,
unsigned long start, unsigned long end)
{
unsigned long next;
unsigned long addr = start;
pgd_t *src_pgdp = pgd_offset_k(start);
dst_pgdp = pgd_offset_pgd(dst_pgdp, start);
do {
next = pgd_addr_end(addr, end);
if (pgd_none(READ_ONCE(*src_pgdp)))
continue;
if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next))
return -ENOMEM;
} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
return 0;
}
/*
* Create trans_pgd and copy linear map.
* info: contains allocator and its argument
* dst_pgdp: new page table that is created, and to which map is copied.
* start: Start of the interval (inclusive).
* end: End of the interval (exclusive).
*
* Returns 0 on success, and -ENOMEM on failure.
*/
int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp,
unsigned long start, unsigned long end)
{
int rc;
pgd_t *trans_pgd = trans_alloc(info);
if (!trans_pgd) {
pr_err("Failed to allocate memory for temporary page tables.\n");
return -ENOMEM;
}
rc = copy_page_tables(info, trans_pgd, start, end);
if (!rc)
*dst_pgdp = trans_pgd;
return rc;
}
/*
* Add map entry to trans_pgd for a base-size page at PTE level.
* info: contains allocator and its argument
* trans_pgd: page table in which new map is added.
* page: page to be mapped.
* dst_addr: new VA address for the page
* pgprot: protection for the page.
*
* Returns 0 on success, and -ENOMEM on failure.
*/
int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd,
void *page, unsigned long dst_addr, pgprot_t pgprot)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_pgd(trans_pgd, dst_addr);
if (pgd_none(READ_ONCE(*pgdp))) {
p4dp = trans_alloc(info);
if (!pgdp)
return -ENOMEM;
pgd_populate(NULL, pgdp, p4dp);
}
p4dp = p4d_offset(pgdp, dst_addr);
if (p4d_none(READ_ONCE(*p4dp))) {
pudp = trans_alloc(info);
if (!pudp)
return -ENOMEM;
p4d_populate(NULL, p4dp, pudp);
}
pudp = pud_offset(p4dp, dst_addr);
if (pud_none(READ_ONCE(*pudp))) {
pmdp = trans_alloc(info);
if (!pmdp)
return -ENOMEM;
pud_populate(NULL, pudp, pmdp);
}
pmdp = pmd_offset(pudp, dst_addr);
if (pmd_none(READ_ONCE(*pmdp))) {
ptep = trans_alloc(info);
if (!ptep)
return -ENOMEM;
pmd_populate_kernel(NULL, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, dst_addr);
set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot));
return 0;
}
/*
* The page we want to idmap may be outside the range covered by VA_BITS that
* can be built using the kernel's p?d_populate() helpers. As a one off, for a
* single page, we build these page tables bottom up and just assume that will
* need the maximum T0SZ.
*
* Returns 0 on success, and -ENOMEM on failure.
* On success trans_ttbr0 contains page table with idmapped page, t0sz is set to
* maximum T0SZ for this page.
*/
int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0,
unsigned long *t0sz, void *page)
{
phys_addr_t dst_addr = virt_to_phys(page);
unsigned long pfn = __phys_to_pfn(dst_addr);
int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47;
int bits_mapped = PAGE_SHIFT - 4;
unsigned long level_mask, prev_level_entry, *levels[4];
int this_level, index, level_lsb, level_msb;
dst_addr &= PAGE_MASK;
prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC));
for (this_level = 3; this_level >= 0; this_level--) {
levels[this_level] = trans_alloc(info);
if (!levels[this_level])
return -ENOMEM;
level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level);
level_msb = min(level_lsb + bits_mapped, max_msb);
level_mask = GENMASK_ULL(level_msb, level_lsb);
index = (dst_addr & level_mask) >> level_lsb;
*(levels[this_level] + index) = prev_level_entry;
pfn = virt_to_pfn(levels[this_level]);
prev_level_entry = pte_val(pfn_pte(pfn,
__pgprot(PMD_TYPE_TABLE)));
if (level_msb == max_msb)
break;
}
*trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn));
*t0sz = TCR_T0SZ(max_msb + 1);
return 0;
}