linux-stable/arch/x86/mm/kasan_init_64.c
Sean Christopherson 1cfaac2400 x86/kasan: Populate shadow for shared chunk of the CPU entry area
Populate the shadow for the shared portion of the CPU entry area, i.e.
the read-only IDT mapping, during KASAN initialization.  A recent change
modified KASAN to map the per-CPU areas on-demand, but forgot to keep a
shadow for the common area that is shared amongst all CPUs.

Map the common area in KASAN init instead of letting idt_map_in_cea() do
the dirty work so that it Just Works in the unlikely event more shared
data is shoved into the CPU entry area.

The bug manifests as a not-present #PF when software attempts to lookup
an IDT entry, e.g. when KVM is handling IRQs on Intel CPUs (KVM performs
direct CALL to the IRQ handler to avoid the overhead of INTn):

 BUG: unable to handle page fault for address: fffffbc0000001d8
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 16c03a067 P4D 16c03a067 PUD 0
 Oops: 0000 [#1] PREEMPT SMP KASAN
 CPU: 5 PID: 901 Comm: repro Tainted: G        W          6.1.0-rc3+ #410
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
 RIP: 0010:kasan_check_range+0xdf/0x190
  vmx_handle_exit_irqoff+0x152/0x290 [kvm_intel]
  vcpu_run+0x1d89/0x2bd0 [kvm]
  kvm_arch_vcpu_ioctl_run+0x3ce/0xa70 [kvm]
  kvm_vcpu_ioctl+0x349/0x900 [kvm]
  __x64_sys_ioctl+0xb8/0xf0
  do_syscall_64+0x2b/0x50
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand")
Reported-by: syzbot+8cdd16fd5a6c0565e227@syzkaller.appspotmail.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221110203504.1985010-6-seanjc@google.com
2022-12-15 10:37:28 -08:00


// SPDX-License-Identifier: GPL-2.0
#define DISABLE_BRANCH_PROFILING
#define pr_fmt(fmt) "kasan: " fmt

/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5

#include <linux/memblock.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/vmalloc.h>

#include <asm/e820/types.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/cpu_entry_area.h>

extern struct range pfn_mapped[E820_MAX_ENTRIES];

static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);

static __init void *early_alloc(size_t size, int nid, bool should_panic)
{
	void *ptr = memblock_alloc_try_nid(size, size,
			__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);

	if (!ptr && should_panic)
		panic("%pS: Failed to allocate page, nid=%d from=%lx\n",
		      (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS));

	return ptr;
}

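/*
 * Populate the PTE level under @pmd for [addr, end). When PSE is
 * available and the range covers one whole, aligned PMD, try to back it
 * with a single 2M page; otherwise fall back to individual 4K shadow
 * pages.
 */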
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
				      unsigned long end, int nid)
{
	pte_t *pte;

	if (pmd_none(*pmd)) {
		void *p;

		if (boot_cpu_has(X86_FEATURE_PSE) &&
		    ((end - addr) == PMD_SIZE) &&
		    IS_ALIGNED(addr, PMD_SIZE)) {
			p = early_alloc(PMD_SIZE, nid, false);
			if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
				return;
			memblock_free(p, PMD_SIZE);
		}

		p = early_alloc(PAGE_SIZE, nid, true);
		pmd_populate_kernel(&init_mm, pmd, p);
	}

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t entry;
		void *p;

		if (!pte_none(*pte))
			continue;

		p = early_alloc(PAGE_SIZE, nid, true);
		entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

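/*
 * Populate the PMD level under @pud for [addr, end). A 1G mapping is
 * attempted first when GB pages are supported and the range spans one
 * whole, aligned PUD.
 */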
static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
				      unsigned long end, int nid)
{
	pmd_t *pmd;
	unsigned long next;

	if (pud_none(*pud)) {
		void *p;

		if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
		    ((end - addr) == PUD_SIZE) &&
		    IS_ALIGNED(addr, PUD_SIZE)) {
			p = early_alloc(PUD_SIZE, nid, false);
			if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
				return;
			memblock_free(p, PUD_SIZE);
		}

		p = early_alloc(PAGE_SIZE, nid, true);
		pud_populate(&init_mm, pud, p);
	}

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_large(*pmd))
			kasan_populate_pmd(pmd, addr, next, nid);
	} while (pmd++, addr = next, addr != end);
}

static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
				      unsigned long end, int nid)
{
	pud_t *pud;
	unsigned long next;

	if (p4d_none(*p4d)) {
		void *p = early_alloc(PAGE_SIZE, nid, true);

		p4d_populate(&init_mm, p4d, p);
	}

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (!pud_large(*pud))
			kasan_populate_pud(pud, addr, next, nid);
	} while (pud++, addr = next, addr != end);
}

static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
				      unsigned long end, int nid)
{
	void *p;
	p4d_t *p4d;
	unsigned long next;

	if (pgd_none(*pgd)) {
		p = early_alloc(PAGE_SIZE, nid, true);
		pgd_populate(&init_mm, pgd, p);
	}

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		kasan_populate_p4d(p4d, addr, next, nid);
	} while (p4d++, addr = next, addr != end);
}

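/*
 * Populate writable shadow memory for the virtual address range
 * [addr, end), walking the kernel page tables from the PGD level down.
 */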
static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
					 int nid)
{
	pgd_t *pgd;
	unsigned long next;

	addr = addr & PAGE_MASK;
	end = round_up(end, PAGE_SIZE);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		kasan_populate_pgd(pgd, addr, next, nid);
	} while (pgd++, addr = next, addr != end);
}

static void __init map_range(struct range *range)
{
	unsigned long start;
	unsigned long end;

	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));

	kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
}

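/*
 * Unmap the early (zero) shadow for [start, end) at the pgd/p4d level so
 * that the range can be repopulated with real shadow memory.
 */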
static void __init clear_pgds(unsigned long start,
			unsigned long end)
{
	pgd_t *pgd;
	/* See comment in kasan_init() */
	unsigned long pgd_end = end & PGDIR_MASK;

	for (; start < pgd_end; start += PGDIR_SIZE) {
		pgd = pgd_offset_k(start);
		/*
		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
		 * instead.
		 */
		if (pgtable_l5_enabled())
			pgd_clear(pgd);
		else
			p4d_clear(p4d_offset(pgd, start));
	}

	pgd = pgd_offset_k(start);
	for (; start < end; start += P4D_SIZE)
		p4d_clear(p4d_offset(pgd, start));
}

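/*
 * With 4-level paging the p4d level is folded into the pgd, so the pgd
 * pointer can be used directly. With 5-level paging, derive the p4d
 * table's virtual address from the physical address in the pgd entry via
 * the kernel text mapping, as this runs too early for the generic
 * p4d_offset() helper.
 */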
static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
{
	unsigned long p4d;

	if (!pgtable_l5_enabled())
		return (p4d_t *)pgd;

	p4d = pgd_val(*pgd) & PTE_PFN_MASK;
	p4d += __START_KERNEL_map - phys_base;
	return (p4d_t *)p4d + p4d_index(addr);
}

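/*
 * Point the pgd entry (if empty) and every empty p4d entry in [addr, end)
 * at the shared early shadow page tables, so the whole shadow range reads
 * as zero from kasan_early_shadow_page until the real shadow is built.
 */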
static void __init kasan_early_p4d_populate(pgd_t *pgd,
		unsigned long addr,
		unsigned long end)
{
	pgd_t pgd_entry;
	p4d_t *p4d, p4d_entry;
	unsigned long next;

	if (pgd_none(*pgd)) {
		pgd_entry = __pgd(_KERNPG_TABLE |
				__pa_nodebug(kasan_early_shadow_p4d));
		set_pgd(pgd, pgd_entry);
	}

	p4d = early_p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		if (!p4d_none(*p4d))
			continue;

		p4d_entry = __p4d(_KERNPG_TABLE |
				__pa_nodebug(kasan_early_shadow_pud));
		set_p4d(p4d, p4d_entry);
	} while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}

static void __init kasan_map_early_shadow(pgd_t *pgd)
{
	/* See comment in kasan_init() */
	unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
	unsigned long end = KASAN_SHADOW_END;
	unsigned long next;

	pgd += pgd_index(addr);
	do {
		next = pgd_addr_end(addr, end);
		kasan_early_p4d_populate(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

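/*
 * The "shallow" helpers below populate only the upper page-table levels
 * (pgd/p4d) of the vmalloc shadow; with CONFIG_KASAN_VMALLOC the lower
 * levels are populated on demand as vmalloc mappings are created.
 */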
static void __init kasan_shallow_populate_p4ds(pgd_t *pgd,
					       unsigned long addr,
					       unsigned long end)
{
	p4d_t *p4d;
	unsigned long next;
	void *p;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		if (p4d_none(*p4d)) {
			p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
			p4d_populate(&init_mm, p4d, p);
		}
	} while (p4d++, addr = next, addr != end);
}

static void __init kasan_shallow_populate_pgds(void *start, void *end)
{
	unsigned long addr, next;
	pgd_t *pgd;
	void *p;

	addr = (unsigned long)start;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, (unsigned long)end);

		if (pgd_none(*pgd)) {
			p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
			pgd_populate(&init_mm, pgd, p);
		}

		/*
		 * we need to populate p4ds to be synced when running in
		 * four level mode - see sync_global_pgds_l4()
		 */
		kasan_shallow_populate_p4ds(pgd, addr, next);
	} while (pgd++, addr = next, addr != (unsigned long)end);
}

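/*
 * Called early in boot: build the zero shadow by pointing every level of
 * the shadow page tables at the shared kasan_early_shadow_* tables, and
 * map that shadow in both early_top_pgt and init_top_pgt. The real shadow
 * is populated later, in kasan_init().
 */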
void __init kasan_early_init(void)
{
	int i;
	pteval_t pte_val = __pa_nodebug(kasan_early_shadow_page) |
				__PAGE_KERNEL | _PAGE_ENC;
	pmdval_t pmd_val = __pa_nodebug(kasan_early_shadow_pte) | _KERNPG_TABLE;
	pudval_t pud_val = __pa_nodebug(kasan_early_shadow_pmd) | _KERNPG_TABLE;
	p4dval_t p4d_val = __pa_nodebug(kasan_early_shadow_pud) | _KERNPG_TABLE;

	/* Mask out unsupported __PAGE_KERNEL bits: */
	pte_val &= __default_kernel_pte_mask;
	pmd_val &= __default_kernel_pte_mask;
	pud_val &= __default_kernel_pte_mask;
	p4d_val &= __default_kernel_pte_mask;

	for (i = 0; i < PTRS_PER_PTE; i++)
		kasan_early_shadow_pte[i] = __pte(pte_val);

	for (i = 0; i < PTRS_PER_PMD; i++)
		kasan_early_shadow_pmd[i] = __pmd(pmd_val);

	for (i = 0; i < PTRS_PER_PUD; i++)
		kasan_early_shadow_pud[i] = __pud(pud_val);

	for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
		kasan_early_shadow_p4d[i] = __p4d(p4d_val);

	kasan_map_early_shadow(early_top_pgt);
	kasan_map_early_shadow(init_top_pgt);
}

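/*
 * Convert a virtual address to its shadow address, rounded down (or up)
 * to a page boundary so the result can be fed to kasan_populate_shadow().
 */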
static unsigned long kasan_mem_to_shadow_align_down(unsigned long va)
{
	unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);

	return round_down(shadow, PAGE_SIZE);
}

static unsigned long kasan_mem_to_shadow_align_up(unsigned long va)
{
	unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);

	return round_up(shadow, PAGE_SIZE);
}

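/*
 * Populate shadow for an arbitrary kernel virtual address range, e.g. the
 * per-CPU portions of the CPU entry area, whose shadow is mapped on
 * demand.
 */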
void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
{
	unsigned long shadow_start, shadow_end;

	shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va);
	shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size);

	kasan_populate_shadow(shadow_start, shadow_end, nid);
}

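/*
 * Main KASAN setup: replace the zero early shadow with real shadow for
 * physical memory, the shared CPU entry area and the kernel image, keep
 * zero shadow for the gaps, then switch back to init_top_pgt and
 * write-protect the early shadow page.
 */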
void __init kasan_init(void)
{
	unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end;
	int i;

	memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));

	/*
	 * We use the same shadow offset for 4- and 5-level paging to
	 * facilitate boot-time switching between paging modes.
	 * As result in 5-level paging mode KASAN_SHADOW_START and
	 * KASAN_SHADOW_END are not aligned to PGD boundary.
	 *
	 * KASAN_SHADOW_START doesn't share PGD with anything else.
	 * We claim whole PGD entry to make things easier.
	 *
	 * KASAN_SHADOW_END lands in the last PGD entry and it collides with
	 * bunch of things like kernel code, modules, EFI mapping, etc.
	 * We need to take extra steps to not overwrite them.
	 */
	if (pgtable_l5_enabled()) {
		void *ptr;

		ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
		memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
		set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
			__pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
	}

	load_cr3(early_top_pgt);
	__flush_tlb_all();

	clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);

	kasan_populate_early_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
			kasan_mem_to_shadow((void *)PAGE_OFFSET));

	for (i = 0; i < E820_MAX_ENTRIES; i++) {
		if (pfn_mapped[i].end == 0)
			break;

		map_range(&pfn_mapped[i]);
	}

	shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
	shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU);
	shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
						      CPU_ENTRY_AREA_MAP_SIZE);

	kasan_populate_early_shadow(
		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
		kasan_mem_to_shadow((void *)VMALLOC_START));

	/*
	 * If we're in full vmalloc mode, don't back vmalloc space with early
	 * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to
	 * the global table and we can populate the lower levels on demand.
	 */
	if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
		kasan_shallow_populate_pgds(
			kasan_mem_to_shadow((void *)VMALLOC_START),
			kasan_mem_to_shadow((void *)VMALLOC_END));
	else
		kasan_populate_early_shadow(
			kasan_mem_to_shadow((void *)VMALLOC_START),
			kasan_mem_to_shadow((void *)VMALLOC_END));

	kasan_populate_early_shadow(
		kasan_mem_to_shadow((void *)VMALLOC_END + 1),
		(void *)shadow_cea_begin);

	/*
	 * Populate the shadow for the shared portion of the CPU entry area.
	 * Shadows for the per-CPU areas are mapped on-demand, as each CPU's
	 * area is randomly placed somewhere in the 512GiB range and mapping
	 * the entire 512GiB range is prohibitively expensive.
	 */
	kasan_populate_shadow(shadow_cea_begin,
			      shadow_cea_per_cpu_begin, 0);

	kasan_populate_early_shadow((void *)shadow_cea_end,
			kasan_mem_to_shadow((void *)__START_KERNEL_map));

	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
			      (unsigned long)kasan_mem_to_shadow(_end),
			      early_pfn_to_nid(__pa(_stext)));

	kasan_populate_early_shadow(kasan_mem_to_shadow((void *)MODULES_END),
			(void *)KASAN_SHADOW_END);

	load_cr3(init_top_pgt);
	__flush_tlb_all();

	/*
	 * kasan_early_shadow_page has been used as early shadow memory, thus
	 * it may contain some garbage. Now we can clear and write protect it,
	 * since after the TLB flush no one should write to it.
	 */
	memset(kasan_early_shadow_page, 0, PAGE_SIZE);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte_t pte;
		pgprot_t prot;

		prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC);
		pgprot_val(prot) &= __default_kernel_pte_mask;

		pte = __pte(__pa(kasan_early_shadow_page) | pgprot_val(prot));
		set_pte(&kasan_early_shadow_pte[i], pte);
	}
	/* Flush TLBs again to be sure that write protection applied. */
	__flush_tlb_all();

	init_task.kasan_depth = 0;

	pr_info("KernelAddressSanitizer initialized\n");
}