2019-05-27 06:55:01 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2016-04-29 13:25:58 +00:00
|
|
|
/*
|
|
|
|
* Page table handling routines for radix page table.
|
|
|
|
*
|
|
|
|
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
|
|
|
|
*/
|
2017-08-30 07:41:29 +00:00
|
|
|
|
|
|
|
#define pr_fmt(fmt) "radix-mmu: " fmt
|
|
|
|
|
2019-06-10 03:08:17 +00:00
|
|
|
#include <linux/io.h>
|
2017-08-30 07:41:29 +00:00
|
|
|
#include <linux/kernel.h>
|
2017-02-03 23:16:44 +00:00
|
|
|
#include <linux/sched/mm.h>
|
2016-04-29 13:25:58 +00:00
|
|
|
#include <linux/memblock.h>
|
2018-09-13 16:09:06 +00:00
|
|
|
#include <linux/of.h>
|
2016-04-29 13:25:58 +00:00
|
|
|
#include <linux/of_fdt.h>
|
2017-06-28 17:04:09 +00:00
|
|
|
#include <linux/mm.h>
|
2019-12-01 01:56:37 +00:00
|
|
|
#include <linux/hugetlb.h>
|
2017-08-30 07:41:17 +00:00
|
|
|
#include <linux/string_helpers.h>
|
2020-07-09 13:19:25 +00:00
|
|
|
#include <linux/memory.h>
|
2016-04-29 13:25:58 +00:00
|
|
|
|
|
|
|
#include <asm/pgalloc.h>
|
powerpc/64s/radix: Boot-time NULL pointer protection using a guard-PID
This change restores and formalises the behaviour that access to NULL
or other user addresses by the kernel during boot should fault rather
than succeed and modify memory. This was inadvertently broken when
fixing another bug, because it was previously not well defined and
only worked by chance.
powerpc/64s/radix uses high address bits to select an address space
"quadrant", which determines which PID and LPID are used to translate
the rest of the address (effective PID, effective LPID). The kernel
mapping at 0xC... selects quadrant 3, which uses PID=0 and LPID=0. So
the kernel page tables are installed in the PID 0 process table entry.
An address at 0x0... selects quadrant 0, which uses PID=PIDR for
translating the rest of the address (that is, it uses the value of the
PIDR register as the effective PID). If PIDR=0, then the translation
is performed with the PID 0 process table entry page tables. This is
the kernel mapping, so we effectively get another copy of the kernel
address space at 0. A NULL pointer access will access physical memory
address 0.
To prevent duplicating the kernel address space in quadrant 0, this
patch allocates a guard PID containing no translations, and
initializes PIDR with this during boot, before the MMU is switched on.
Any kernel access to quadrant 0 will use this guard PID for
translation and find no valid mappings, and therefore fault.
After boot, this PID will be switched away to user context PIDs, but
those contain user mappings (and usually NULL pointer protection)
rather than kernel mappings, which is much safer (and by design). This
may be tightened further in future, and the guard PID could be used
for that.
Commit 371b8044 ("powerpc/64s: Initialize ISAv3 MMU registers before
setting partition table") introduced this problem because it zeroes
PIDR at boot. However, previously the value was inherited from firmware
or kexec, which is not robust and can be zero (e.g., mambo).
Fixes: 371b80447ff3 ("powerpc/64s: Initialize ISAv3 MMU registers before setting partition table")
Cc: stable@vger.kernel.org # v4.15+
Reported-by: Florian Weimer <fweimer@redhat.com>
Tested-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
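An illustrative aside (not part of the patch): a minimal sketch of the
quadrant selection described above. The helper name radix_ea_quadrant()
is made up for illustration; only the arithmetic follows the description.
static inline unsigned int radix_ea_quadrant(unsigned long ea)
{
	/*
	 * The top two effective-address bits pick the quadrant:
	 * 0x0... -> quadrant 0 (translated with PIDR as the effective PID)
	 * 0xC... -> quadrant 3 (translated with PID 0, LPID 0)
	 */
	return ea >> 62;
}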
2018-02-07 01:20:02 +00:00
|
|
|
#include <asm/mmu_context.h>
|
2016-04-29 13:25:58 +00:00
|
|
|
#include <asm/dma.h>
|
|
|
|
#include <asm/machdep.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/firmware.h>
|
2016-12-14 02:36:51 +00:00
|
|
|
#include <asm/powernv.h>
|
2017-06-06 05:48:57 +00:00
|
|
|
#include <asm/sections.h>
|
2020-03-02 01:04:10 +00:00
|
|
|
#include <asm/smp.h>
|
2017-04-11 05:23:25 +00:00
|
|
|
#include <asm/trace.h>
|
2019-04-18 06:51:24 +00:00
|
|
|
#include <asm/uaccess.h>
|
2019-08-22 03:48:36 +00:00
|
|
|
#include <asm/ultravisor.h>
|
2022-09-26 07:57:23 +00:00
|
|
|
#include <asm/set_memory.h>
|
2016-04-29 13:25:58 +00:00
|
|
|
|
2016-04-29 13:26:30 +00:00
|
|
|
#include <trace/events/thp.h>
|
|
|
|
|
2022-09-26 07:57:26 +00:00
|
|
|
#include <mm/mmu_decl.h>
|
|
|
|
|
powerpc/mm/radix: Workaround prefetch issue with KVM
There's a somewhat architectural issue with Radix MMU and KVM.
When coming out of a guest with AIL (Alternate Interrupt Location, ie,
MMU enabled), we start executing hypervisor code with the PID register
still containing whatever the guest has been using.
The problem is that the CPU can (and will) then start prefetching or
speculatively load from whatever host context has that same PID (if
any), thus bringing translations for that context into the TLB, which
Linux doesn't know about.
This can cause stale translations and subsequent crashes.
Fixing this in a way that is neither racy nor a huge performance hit
is difficult. We could just make the host invalidations always use
broadcast forms, but that would hurt single-threaded programs, for
example.
We chose to fix it instead by partitioning the PID space between guest
and host. This is possible because today Linux only uses 19 out of the
20 bits of PID space, so existing guests will work if we make the host
use the top half of the 20-bit space.
We additionally add support for a property to indicate to Linux the
size of the PID register, which will be useful if we eventually have
processors with a larger PID space available.
There is still an issue with malicious guests purposefully setting the
PID register to a value in the host's PID range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:
- On the way out of a guest, before we clear the current VCPU in the
PACA, we check the PID and if it's outside of the permitted range
we flush the TLB for that PID.
- When context switching, if the mm is "new" on that CPU (the
corresponding bit was set for the first time in the mm cpumask), we
check if any sibling thread is in KVM (has a non-NULL VCPU pointer
in the PACA). If that is the case, we also flush the PID for that
CPU (core).
This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations can
exist before we detect it and flush them out.
A future optimization could be added by keeping track of whether the
PID has ever been used and avoid doing that for completely fresh PIDs.
We could similarly mark PIDs that have been the subject of a global
invalidation as "fresh". But for now this will do.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
unneeded include of kvm_book3s_asm.h]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
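An illustrative aside (not part of the patch): the arithmetic behind the
split described above, assuming a 20-bit PID register with guests limited
to the low 19 bits. The helper name is made up; the real code simply sets
mmu_pid_bits and mmu_base_pid.
static inline unsigned int host_base_pid(unsigned int guest_pid_bits)
{
	/*
	 * Guests keep PIDs [0, 1 << guest_pid_bits); the host takes the
	 * rest of the 20-bit space, i.e. it starts at 0x80000 when the
	 * guest is limited to 19 bits.
	 */
	return 1u << guest_pid_bits;
}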
2017-07-24 04:26:06 +00:00
|
|
|
unsigned int mmu_base_pid;
|
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2016-04-29 13:25:58 +00:00
|
|
|
{
|
2019-03-08 00:30:48 +00:00
|
|
|
phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
|
|
|
|
phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
|
2019-03-12 06:30:31 +00:00
|
|
|
void *ptr;
|
2016-04-29 13:25:58 +00:00
|
|
|
|
2019-03-08 00:30:48 +00:00
|
|
|
if (region_start)
|
|
|
|
min_addr = region_start;
|
|
|
|
if (region_end)
|
|
|
|
max_addr = region_end;
|
2018-02-13 15:08:24 +00:00
|
|
|
|
2019-03-12 06:30:31 +00:00
|
|
|
ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
|
|
|
|
|
|
|
|
if (!ptr)
|
|
|
|
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
|
|
|
|
__func__, size, size, nid, &min_addr, &max_addr);
|
|
|
|
|
|
|
|
return ptr;
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
2020-07-09 13:19:22 +00:00
|
|
|
/*
|
|
|
|
* When allocating pud or pmd pointers, we allocate a complete page
|
|
|
|
* of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
|
|
|
|
* is to ensure that the page obtained from the memblock allocator
|
|
|
|
* can be completely used as page table page and can be freed
|
|
|
|
* correctly when the page table entries are removed.
|
|
|
|
*/
|
2018-02-13 15:08:23 +00:00
|
|
|
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
|
2016-04-29 13:25:58 +00:00
|
|
|
pgprot_t flags,
|
2018-02-13 15:08:24 +00:00
|
|
|
unsigned int map_page_size,
|
|
|
|
int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2016-04-29 13:25:58 +00:00
|
|
|
{
|
2018-02-13 15:08:24 +00:00
|
|
|
unsigned long pfn = pa >> PAGE_SHIFT;
|
2018-02-13 15:08:23 +00:00
|
|
|
pgd_t *pgdp;
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d_t *p4dp;
|
2018-02-13 15:08:23 +00:00
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
pgdp = pgd_offset_k(ea);
|
2020-06-04 23:46:44 +00:00
|
|
|
p4dp = p4d_offset(pgdp, ea);
|
|
|
|
if (p4d_none(*p4dp)) {
|
2020-07-09 13:19:22 +00:00
|
|
|
pudp = early_alloc_pgtable(PAGE_SIZE, nid,
|
|
|
|
region_start, region_end);
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d_populate(&init_mm, p4dp, pudp);
|
2018-02-13 15:08:23 +00:00
|
|
|
}
|
2020-06-04 23:46:44 +00:00
|
|
|
pudp = pud_offset(p4dp, ea);
|
2018-02-13 15:08:23 +00:00
|
|
|
if (map_page_size == PUD_SIZE) {
|
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto set_the_pte;
|
|
|
|
}
|
|
|
|
if (pud_none(*pudp)) {
|
2020-07-09 13:19:22 +00:00
|
|
|
pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
|
|
|
|
region_end);
|
2018-02-13 15:08:23 +00:00
|
|
|
pud_populate(&init_mm, pudp, pmdp);
|
|
|
|
}
|
|
|
|
pmdp = pmd_offset(pudp, ea);
|
|
|
|
if (map_page_size == PMD_SIZE) {
|
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto set_the_pte;
|
|
|
|
}
|
|
|
|
if (!pmd_present(*pmdp)) {
|
2018-02-13 15:08:24 +00:00
|
|
|
ptep = early_alloc_pgtable(PAGE_SIZE, nid,
|
|
|
|
region_start, region_end);
|
2018-02-13 15:08:23 +00:00
|
|
|
pmd_populate_kernel(&init_mm, pmdp, ptep);
|
|
|
|
}
|
|
|
|
ptep = pte_offset_kernel(pmdp, ea);
|
|
|
|
|
|
|
|
set_the_pte:
|
2018-02-13 15:08:24 +00:00
|
|
|
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
|
powerpc/64s: Fix pte update for kernel memory on radix
When adding a PTE a ptesync is needed to order the update of the PTE
with subsequent accesses otherwise a spurious fault may be raised.
radix__set_pte_at() does not do this for performance gains. For
non-kernel memory this is not an issue as any faults of this kind are
corrected by the page fault handler. For kernel memory these faults
are not handled. The current solution is that there is a ptesync in
flush_cache_vmap() which should be called when mapping from the
vmalloc region.
However, map_kernel_page() does not call flush_cache_vmap(). This is
troublesome in particular for code patching with Strict RWX on radix.
In do_patch_instruction() the page frame that contains the instruction
to be patched is mapped and then immediately patched. With no ordering
or synchronization between setting up the PTE and writing to the page,
it is possible for faults to occur.
As the code patching is done using __put_user_asm_goto() the resulting
fault is obscured - but using a normal store instead it can be seen:
BUG: Unable to handle kernel data access on write at 0xc008000008f24a3c
Faulting instruction address: 0xc00000000008bd74
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: nop_module(PO+) [last unloaded: nop_module]
CPU: 4 PID: 757 Comm: sh Tainted: P O 5.10.0-rc5-01361-ge3c1b78c8440-dirty #43
NIP: c00000000008bd74 LR: c00000000008bd50 CTR: c000000000025810
REGS: c000000016f634a0 TRAP: 0300 Tainted: P O (5.10.0-rc5-01361-ge3c1b78c8440-dirty)
MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 44002884 XER: 00000000
CFAR: c00000000007c68c DAR: c008000008f24a3c DSISR: 42000000 IRQMASK: 1
This results in the kind of issue reported here:
https://lore.kernel.org/linuxppc-dev/15AC5B0E-A221-4B8C-9039-FA96B8EF7C88@lca.pw/
Chris Riedl suggested a reliable way to reproduce the issue:
$ mount -t debugfs none /sys/kernel/debug
$ (while true; do echo function > /sys/kernel/debug/tracing/current_tracer ; echo nop > /sys/kernel/debug/tracing/current_tracer ; done) &
Turning ftrace on and off does a large amount of code patching, which
usually crashes in less than 5 minutes, giving a trace like:
ftrace-powerpc: (____ptrval____): replaced (4b473b11) != old (60000000)
------------[ ftrace bug ]------------
ftrace failed to modify
[<c000000000bf8e5c>] napi_busy_loop+0xc/0x390
actual: 11:3b:47:4b
Setting ftrace call site to call ftrace function
ftrace record flags: 80000001
(1)
expected tramp: c00000000006c96c
------------[ cut here ]------------
WARNING: CPU: 4 PID: 809 at kernel/trace/ftrace.c:2065 ftrace_bug+0x28c/0x2e8
Modules linked in: nop_module(PO-) [last unloaded: nop_module]
CPU: 4 PID: 809 Comm: sh Tainted: P O 5.10.0-rc5-01360-gf878ccaf250a #1
NIP: c00000000024f334 LR: c00000000024f330 CTR: c0000000001a5af0
REGS: c000000004c8b760 TRAP: 0700 Tainted: P O (5.10.0-rc5-01360-gf878ccaf250a)
MSR: 900000000282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 28008848 XER: 20040000
CFAR: c0000000001a9c98 IRQMASK: 0
GPR00: c00000000024f330 c000000004c8b9f0 c000000002770600 0000000000000022
GPR04: 00000000ffff7fff c000000004c8b6d0 0000000000000027 c0000007fe9bcdd8
GPR08: 0000000000000023 ffffffffffffffd8 0000000000000027 c000000002613118
GPR12: 0000000000008000 c0000007fffdca00 0000000000000000 0000000000000000
GPR16: 0000000023ec37c5 0000000000000000 0000000000000000 0000000000000008
GPR20: c000000004c8bc90 c0000000027a2d20 c000000004c8bcd0 c000000002612fe8
GPR24: 0000000000000038 0000000000000030 0000000000000028 0000000000000020
GPR28: c000000000ff1b68 c000000000bf8e5c c00000000312f700 c000000000fbb9b0
NIP ftrace_bug+0x28c/0x2e8
LR ftrace_bug+0x288/0x2e8
Call Trace:
ftrace_bug+0x288/0x2e8 (unreliable)
ftrace_modify_all_code+0x168/0x210
arch_ftrace_update_code+0x18/0x30
ftrace_run_update_code+0x44/0xc0
ftrace_startup+0xf8/0x1c0
register_ftrace_function+0x4c/0xc0
function_trace_init+0x80/0xb0
tracing_set_tracer+0x2a4/0x4f0
tracing_set_trace_write+0xd4/0x130
vfs_write+0xf0/0x330
ksys_write+0x84/0x140
system_call_exception+0x14c/0x230
system_call_common+0xf0/0x27c
To fix this, add a ptesync when updating kernel memory PTEs.
Fixes: f1cb8f9beba8 ("powerpc/64s/radix: avoid ptesync after set_pte and ptep_set_access_flags")
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Tidy up change log slightly]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20210208032957.1232102-1-jniethe5@gmail.com
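An illustrative aside (hypothetical caller, not code from this file): the
hazard described above is mapping a page and then storing to it straight
away, which is only safe once the PTE update has been made visible by the
ptesync issued below.
static int patch_word(unsigned long ea, unsigned long pa, unsigned int insn)
{
	int rc;

	/* Installs the PTE; the mapping path issues the ptesync. */
	rc = radix__map_kernel_page(ea, pa, PAGE_KERNEL, PAGE_SIZE);
	if (rc)
		return rc;

	/* Safe only because the ptesync ordered the PTE update before this store. */
	*(volatile unsigned int *)ea = insn;
	return 0;
}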
2021-02-08 03:29:56 +00:00
|
|
|
asm volatile("ptesync": : :"memory");
|
2018-02-13 15:08:23 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
/*
|
|
|
|
* nid, region_start, and region_end are hints to try to place the page
|
|
|
|
* table memory in the same node or region.
|
|
|
|
*/
|
|
|
|
static int __map_kernel_page(unsigned long ea, unsigned long pa,
|
2016-04-29 13:25:58 +00:00
|
|
|
pgprot_t flags,
|
2018-02-13 15:08:24 +00:00
|
|
|
unsigned int map_page_size,
|
|
|
|
int nid,
|
|
|
|
unsigned long region_start, unsigned long region_end)
|
2016-04-29 13:25:58 +00:00
|
|
|
{
|
2018-02-13 15:08:24 +00:00
|
|
|
unsigned long pfn = pa >> PAGE_SHIFT;
|
2016-04-29 13:25:58 +00:00
|
|
|
pgd_t *pgdp;
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d_t *p4dp;
|
2016-04-29 13:25:58 +00:00
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
/*
|
|
|
|
* Make sure task size is correct as per the max addr
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
|
2018-02-13 15:08:23 +00:00
|
|
|
|
2019-04-17 12:59:14 +00:00
|
|
|
#ifdef CONFIG_PPC_64K_PAGES
|
|
|
|
BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
|
|
|
|
#endif
|
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
if (unlikely(!slab_is_available()))
|
|
|
|
return early_map_kernel_page(ea, pa, flags, map_page_size,
|
|
|
|
nid, region_start, region_end);
|
2018-02-13 15:08:23 +00:00
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
/*
|
|
|
|
* We should make the page table allocation functions able to take a
|
|
|
|
* node, so we can place kernel page tables on the right nodes after
|
|
|
|
* boot.
|
|
|
|
*/
|
2018-02-13 15:08:23 +00:00
|
|
|
pgdp = pgd_offset_k(ea);
|
2020-06-04 23:46:44 +00:00
|
|
|
p4dp = p4d_offset(pgdp, ea);
|
|
|
|
pudp = pud_alloc(&init_mm, p4dp, ea);
|
2018-02-13 15:08:23 +00:00
|
|
|
if (!pudp)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (map_page_size == PUD_SIZE) {
|
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto set_the_pte;
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
2018-02-13 15:08:23 +00:00
|
|
|
pmdp = pmd_alloc(&init_mm, pudp, ea);
|
|
|
|
if (!pmdp)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (map_page_size == PMD_SIZE) {
|
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto set_the_pte;
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
2018-02-13 15:08:23 +00:00
|
|
|
ptep = pte_alloc_kernel(pmdp, ea);
|
|
|
|
if (!ptep)
|
|
|
|
return -ENOMEM;
|
2016-04-29 13:25:58 +00:00
|
|
|
|
|
|
|
set_the_pte:
|
2018-02-13 15:08:24 +00:00
|
|
|
set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
|
powerpc/64s: Fix pte update for kernel memory on radix
2021-02-08 03:29:56 +00:00
|
|
|
asm volatile("ptesync": : :"memory");
|
2016-04-29 13:25:58 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
int radix__map_kernel_page(unsigned long ea, unsigned long pa,
|
|
|
|
pgprot_t flags,
|
|
|
|
unsigned int map_page_size)
|
|
|
|
{
|
|
|
|
return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
|
|
|
|
}
|
|
|
|
|
2017-06-28 17:04:09 +00:00
|
|
|
#ifdef CONFIG_STRICT_KERNEL_RWX
|
2021-04-13 13:54:27 +00:00
|
|
|
static void radix__change_memory_range(unsigned long start, unsigned long end,
|
|
|
|
unsigned long clear)
|
2017-06-28 17:04:09 +00:00
|
|
|
{
|
|
|
|
unsigned long idx;
|
|
|
|
pgd_t *pgdp;
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d_t *p4dp;
|
2017-06-28 17:04:09 +00:00
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pte_t *ptep;
|
|
|
|
|
|
|
|
start = ALIGN_DOWN(start, PAGE_SIZE);
|
|
|
|
end = PAGE_ALIGN(end); // aligns up
|
|
|
|
|
2017-07-14 06:51:21 +00:00
|
|
|
pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
|
|
|
|
start, end, clear);
|
2017-06-28 17:04:09 +00:00
|
|
|
|
|
|
|
for (idx = start; idx < end; idx += PAGE_SIZE) {
|
|
|
|
pgdp = pgd_offset_k(idx);
|
2020-06-04 23:46:44 +00:00
|
|
|
p4dp = p4d_offset(pgdp, idx);
|
|
|
|
pudp = pud_alloc(&init_mm, p4dp, idx);
|
2017-06-28 17:04:09 +00:00
|
|
|
if (!pudp)
|
|
|
|
continue;
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pud_leaf(*pudp)) {
|
2017-06-28 17:04:09 +00:00
|
|
|
ptep = (pte_t *)pudp;
|
|
|
|
goto update_the_pte;
|
|
|
|
}
|
|
|
|
pmdp = pmd_alloc(&init_mm, pudp, idx);
|
|
|
|
if (!pmdp)
|
|
|
|
continue;
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pmd_leaf(*pmdp)) {
|
2017-06-28 17:04:09 +00:00
|
|
|
ptep = pmdp_ptep(pmdp);
|
|
|
|
goto update_the_pte;
|
|
|
|
}
|
|
|
|
ptep = pte_alloc_kernel(pmdp, idx);
|
|
|
|
if (!ptep)
|
|
|
|
continue;
|
|
|
|
update_the_pte:
|
2017-07-14 06:51:21 +00:00
|
|
|
radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
|
2017-06-28 17:04:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
radix__flush_tlb_kernel_range(start, end);
|
|
|
|
}
|
2017-07-14 06:51:21 +00:00
|
|
|
|
|
|
|
void radix__mark_rodata_ro(void)
|
|
|
|
{
|
|
|
|
unsigned long start, end;
|
|
|
|
|
|
|
|
start = (unsigned long)_stext;
|
2022-09-16 04:07:49 +00:00
|
|
|
end = (unsigned long)__end_rodata;
|
2017-07-14 06:51:21 +00:00
|
|
|
|
|
|
|
radix__change_memory_range(start, end, _PAGE_WRITE);
|
powerpc/64s/radix: Fix RWX mapping with relocated kernel
If a relocatable kernel is loaded at a non-zero address and told not to
relocate to zero (kdump or RELOCATABLE_TEST), the mapping of the
interrupt code at zero is left with RWX permissions.
That is a security weakness, and leads to a warning at boot if
CONFIG_DEBUG_WX is enabled:
powerpc/mm: Found insecure W+X mapping at address 00000000056435bc/0xc000000000000000
WARNING: CPU: 1 PID: 1 at arch/powerpc/mm/ptdump/ptdump.c:193 note_page+0x484/0x4c0
CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc1-00001-g8ae8e98aea82-dirty #175
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,git-dd0dca hv:linux,kvm pSeries
NIP: c0000000004a1c34 LR: c0000000004a1c30 CTR: 0000000000000000
REGS: c000000003503770 TRAP: 0700 Not tainted (6.2.0-rc1-00001-g8ae8e98aea82-dirty)
MSR: 8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 24000220 XER: 00000000
CFAR: c000000000545a58 IRQMASK: 0
...
NIP note_page+0x484/0x4c0
LR note_page+0x480/0x4c0
Call Trace:
note_page+0x480/0x4c0 (unreliable)
ptdump_pmd_entry+0xc8/0x100
walk_pgd_range+0x618/0xab0
walk_page_range_novma+0x74/0xc0
ptdump_walk_pgd+0x98/0x170
ptdump_check_wx+0x94/0x100
mark_rodata_ro+0x30/0x70
kernel_init+0x78/0x1a0
ret_from_kernel_thread+0x5c/0x64
The fix has two parts. Firstly the pages from zero up to the end of
interrupts need to be marked read-only, so that they are left with R-X
permissions. Secondly the mapping logic needs to be taught to ensure
there is a page boundary at the end of the interrupt region, so that the
permission change only applies to the interrupt text, and not the region
following it.
Fixes: c55d7b5e6426 ("powerpc: Remove STRICT_KERNEL_RWX incompatibility with RELOCATABLE")
Reported-by: Sachin Sant <sachinp@linux.ibm.com>
Tested-by: Sachin Sant <sachinp@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230110124753.1325426-2-mpe@ellerman.id.au
2023-01-10 12:47:53 +00:00
|
|
|
|
|
|
|
for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
|
|
|
|
end = start + PAGE_SIZE;
|
|
|
|
if (overlaps_interrupt_vector_text(start, end))
|
|
|
|
radix__change_memory_range(start, end, _PAGE_WRITE);
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
2017-07-14 06:51:21 +00:00
|
|
|
}
|
2017-07-14 06:51:23 +00:00
|
|
|
|
|
|
|
void radix__mark_initmem_nx(void)
|
|
|
|
{
|
|
|
|
unsigned long start = (unsigned long)__init_begin;
|
|
|
|
unsigned long end = (unsigned long)__init_end;
|
|
|
|
|
|
|
|
radix__change_memory_range(start, end, _PAGE_EXEC);
|
|
|
|
}
|
2017-06-28 17:04:09 +00:00
|
|
|
#endif /* CONFIG_STRICT_KERNEL_RWX */
|
|
|
|
|
2018-10-17 12:53:38 +00:00
|
|
|
static inline void __meminit
|
|
|
|
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
|
2017-01-16 19:07:43 +00:00
|
|
|
{
|
2017-08-30 07:41:17 +00:00
|
|
|
char buf[10];
|
|
|
|
|
2017-01-16 19:07:43 +00:00
|
|
|
if (end <= start)
|
|
|
|
return;
|
|
|
|
|
2017-08-30 07:41:17 +00:00
|
|
|
string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
|
|
|
|
|
2018-10-17 12:53:38 +00:00
|
|
|
pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
|
|
|
|
exec ? " (exec)" : "");
|
2017-01-16 19:07:43 +00:00
|
|
|
}
|
|
|
|
|
2018-08-14 12:37:32 +00:00
|
|
|
static unsigned long next_boundary(unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_STRICT_KERNEL_RWX
|
powerpc/64s/radix: Fix crash with unaligned relocated kernel
If a relocatable kernel is loaded at an address that is not 2MB aligned
and told not to relocate to zero, the kernel can crash due to
mark_rodata_ro() incorrectly changing some read-write data to read-only.
Scenarios where the misalignment can occur are when the kernel is
loaded by kdump or using the RELOCATABLE_TEST config option.
Example crash with the kernel loaded at 5MB:
Run /sbin/init as init process
BUG: Unable to handle kernel data access on write at 0xc000000000452000
Faulting instruction address: 0xc0000000005b6730
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
CPU: 1 PID: 1 Comm: init Not tainted 6.2.0-rc1-00011-g349188be4841 #166
Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,git-5b4c5a hv:linux,kvm pSeries
NIP: c0000000005b6730 LR: c000000000ae9ab8 CTR: 0000000000000380
REGS: c000000004503250 TRAP: 0300 Not tainted (6.2.0-rc1-00011-g349188be4841)
MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 44288480 XER: 00000000
CFAR: c0000000005b66ec DAR: c000000000452000 DSISR: 0a000000 IRQMASK: 0
...
NIP memset+0x68/0x104
LR zero_user_segments.constprop.0+0xa8/0xf0
Call Trace:
ext4_mpage_readpages+0x7f8/0x830
ext4_readahead+0x48/0x60
read_pages+0xb8/0x380
page_cache_ra_unbounded+0x19c/0x250
filemap_fault+0x58c/0xae0
__do_fault+0x60/0x100
__handle_mm_fault+0x1230/0x1a40
handle_mm_fault+0x120/0x300
___do_page_fault+0x20c/0xa80
do_page_fault+0x30/0xc0
data_access_common_virt+0x210/0x220
This happens because mark_rodata_ro() tries to change permissions on the
range _stext..__end_rodata, but _stext sits in the middle of the 2MB
page from 4MB to 6MB:
radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec)
radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages
radix-mmu: Mapped 0x0000000000400000-0x0000000002400000 with 2.00 MiB pages (exec)
The logic that changes the permissions assumes the linear mapping was
split correctly at boot, so it marks the entire 2MB page read-only. That
leads to the write fault above.
To fix it, the boot time mapping logic needs to consider that if the
kernel is running at a non-zero address then _stext is a boundary where
it must split the mapping.
That leads to the mapping being split correctly, allowing the rodata
permission change to happen correctly, with no spillover:
radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec)
radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages
radix-mmu: Mapped 0x0000000000400000-0x0000000000500000 with 64.0 KiB pages
radix-mmu: Mapped 0x0000000000500000-0x0000000000600000 with 64.0 KiB pages (exec)
radix-mmu: Mapped 0x0000000000600000-0x0000000002400000 with 2.00 MiB pages (exec)
If the kernel is loaded at a 2MB aligned address, the mapping continues
to use 2MB pages as before:
radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec)
radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages
radix-mmu: Mapped 0x0000000000400000-0x0000000002c00000 with 2.00 MiB pages (exec)
radix-mmu: Mapped 0x0000000002c00000-0x0000000100000000 with 2.00 MiB pages
Fixes: c55d7b5e6426 ("powerpc: Remove STRICT_KERNEL_RWX incompatibility with RELOCATABLE")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20230110124753.1325426-1-mpe@ellerman.id.au
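An illustrative aside (not code from this file): the size-selection rule
that next_boundary() and create_physical_mapping() below implement,
reduced to its core. The gap to the next boundary (__end_interrupts,
_stext, __srwx_boundary, or the end of the range) caps the mapping, and
the largest naturally aligned size that fits is used; the real code also
checks mmu_psize_defs[] for which sizes the MMU supports.
static unsigned long pick_mapping_size(unsigned long addr, unsigned long gap)
{
	if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE)
		return PUD_SIZE;	/* 1G mapping */
	if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE)
		return PMD_SIZE;	/* 2M mapping */
	return PAGE_SIZE;		/* base page size (64K here) */
}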
2023-01-10 12:47:52 +00:00
|
|
|
unsigned long stext_phys;
|
|
|
|
|
|
|
|
stext_phys = __pa_symbol(_stext);
|
|
|
|
|
|
|
|
// Relocatable kernel running at non-zero real address
|
|
|
|
if (stext_phys != 0) {
|
powerpc/64s/radix: Fix RWX mapping with relocated kernel
2023-01-10 12:47:53 +00:00
|
|
|
// The end of interrupts code at zero is a rodata boundary
|
|
|
|
unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
|
|
|
|
if (addr < end_intr)
|
|
|
|
return end_intr;
|
|
|
|
|
powerpc/64s/radix: Fix crash with unaligned relocated kernel
2023-01-10 12:47:52 +00:00
|
|
|
// Start of relocated kernel text is a rodata boundary
|
|
|
|
if (addr < stext_phys)
|
|
|
|
return stext_phys;
|
|
|
|
}
|
|
|
|
|
2022-09-16 04:41:24 +00:00
|
|
|
if (addr < __pa_symbol(__srwx_boundary))
|
|
|
|
return __pa_symbol(__srwx_boundary);
|
2018-08-14 12:37:32 +00:00
|
|
|
#endif
|
|
|
|
return end;
|
|
|
|
}
|
|
|
|
|
2017-01-16 19:07:43 +00:00
|
|
|
static int __meminit create_physical_mapping(unsigned long start,
|
2018-02-13 15:08:24 +00:00
|
|
|
unsigned long end,
|
2020-04-10 21:33:32 +00:00
|
|
|
int nid, pgprot_t _prot)
|
2017-01-16 19:07:43 +00:00
|
|
|
{
|
2017-06-06 05:48:57 +00:00
|
|
|
unsigned long vaddr, addr, mapping_size = 0;
|
2018-10-17 12:53:38 +00:00
|
|
|
bool prev_exec, exec = false;
|
2017-06-06 05:48:57 +00:00
|
|
|
pgprot_t prot;
|
2018-08-13 05:44:57 +00:00
|
|
|
int psize;
|
2023-08-01 04:44:46 +00:00
|
|
|
unsigned long max_mapping_size = memory_block_size;
|
2022-09-26 07:57:23 +00:00
|
|
|
|
2022-09-26 07:57:26 +00:00
|
|
|
if (debug_pagealloc_enabled_or_kfence())
|
2022-09-26 07:57:23 +00:00
|
|
|
max_mapping_size = PAGE_SIZE;
|
2017-01-16 19:07:43 +00:00
|
|
|
|
2020-04-20 18:36:36 +00:00
|
|
|
start = ALIGN(start, PAGE_SIZE);
|
2020-09-07 07:25:39 +00:00
|
|
|
end = ALIGN_DOWN(end, PAGE_SIZE);
|
2017-01-16 19:07:43 +00:00
|
|
|
for (addr = start; addr < end; addr += mapping_size) {
|
|
|
|
unsigned long gap, previous_size;
|
|
|
|
int rc;
|
|
|
|
|
2018-08-14 12:37:32 +00:00
|
|
|
gap = next_boundary(addr, end) - addr;
|
2020-07-09 13:19:25 +00:00
|
|
|
if (gap > max_mapping_size)
|
|
|
|
gap = max_mapping_size;
|
2017-01-16 19:07:43 +00:00
|
|
|
previous_size = mapping_size;
|
2018-10-17 12:53:38 +00:00
|
|
|
prev_exec = exec;
|
2017-01-16 19:07:43 +00:00
|
|
|
|
|
|
|
if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
|
2018-08-14 12:01:44 +00:00
|
|
|
mmu_psize_defs[MMU_PAGE_1G].shift) {
|
2017-01-16 19:07:43 +00:00
|
|
|
mapping_size = PUD_SIZE;
|
2018-08-13 05:44:57 +00:00
|
|
|
psize = MMU_PAGE_1G;
|
|
|
|
} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
|
|
|
|
mmu_psize_defs[MMU_PAGE_2M].shift) {
|
2017-01-16 19:07:43 +00:00
|
|
|
mapping_size = PMD_SIZE;
|
2018-08-13 05:44:57 +00:00
|
|
|
psize = MMU_PAGE_2M;
|
|
|
|
} else {
|
2017-01-16 19:07:43 +00:00
|
|
|
mapping_size = PAGE_SIZE;
|
2018-08-13 05:44:57 +00:00
|
|
|
psize = mmu_virtual_psize;
|
|
|
|
}
|
2017-06-28 17:04:09 +00:00
|
|
|
|
2017-06-06 05:48:57 +00:00
|
|
|
vaddr = (unsigned long)__va(addr);
|
|
|
|
|
2017-06-28 17:04:10 +00:00
|
|
|
if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
|
2018-10-17 12:53:38 +00:00
|
|
|
overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
|
2017-06-06 05:48:57 +00:00
|
|
|
prot = PAGE_KERNEL_X;
|
2018-10-17 12:53:38 +00:00
|
|
|
exec = true;
|
|
|
|
} else {
|
2020-04-10 21:33:32 +00:00
|
|
|
prot = _prot;
|
2018-10-17 12:53:38 +00:00
|
|
|
exec = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mapping_size != previous_size || exec != prev_exec) {
|
|
|
|
print_mapping(start, addr, previous_size, prev_exec);
|
|
|
|
start = addr;
|
|
|
|
}
|
2017-06-06 05:48:57 +00:00
|
|
|
|
2018-02-13 15:08:24 +00:00
|
|
|
rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
|
2017-01-16 19:07:43 +00:00
|
|
|
if (rc)
|
|
|
|
return rc;
|
2018-08-13 05:44:57 +00:00
|
|
|
|
|
|
|
update_page_count(psize, 1);
|
2017-01-16 19:07:43 +00:00
|
|
|
}
|
|
|
|
|
2018-10-17 12:53:38 +00:00
|
|
|
print_mapping(start, addr, mapping_size, exec);
|
2017-01-16 19:07:43 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-05-04 10:24:27 +00:00
|
|
|
static void __init radix_init_pgtable(void)
|
2016-04-29 13:25:58 +00:00
|
|
|
{
|
|
|
|
unsigned long rts_field;
|
2020-10-13 23:58:08 +00:00
|
|
|
phys_addr_t start, end;
|
|
|
|
u64 i;
|
2016-04-29 13:25:58 +00:00
|
|
|
|
|
|
|
/* We don't support slb for radix */
|
2021-12-01 14:41:52 +00:00
|
|
|
slb_set_size(0);
|
2020-07-09 13:19:25 +00:00
|
|
|
|
2016-04-29 13:25:58 +00:00
|
|
|
/*
|
2020-07-09 13:19:25 +00:00
|
|
|
* Create the linear mapping
|
2016-04-29 13:25:58 +00:00
|
|
|
*/
|
2020-10-13 23:58:08 +00:00
|
|
|
for_each_mem_range(i, &start, &end) {
|
2018-02-13 15:08:24 +00:00
|
|
|
/*
|
|
|
|
* The memblock allocator is up at this point, so the
|
|
|
|
* page tables will be allocated within the range. No
|
|
|
|
* need for a node (which we don't have yet).
|
|
|
|
*/
|
2019-04-17 12:59:15 +00:00
|
|
|
|
2020-10-13 23:58:08 +00:00
|
|
|
if (end >= RADIX_VMALLOC_START) {
|
2019-04-23 15:10:17 +00:00
|
|
|
pr_warn("Outside the supported range\n");
|
2019-04-17 12:59:15 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-13 23:58:08 +00:00
|
|
|
WARN_ON(create_physical_mapping(start, end,
|
2020-04-10 21:33:32 +00:00
|
|
|
-1, PAGE_KERNEL));
|
2018-02-13 15:08:24 +00:00
|
|
|
}
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
|
2021-05-28 09:07:41 +00:00
|
|
|
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
|
|
|
|
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
/*
|
2022-04-30 18:56:54 +00:00
|
|
|
* Older versions of KVM on these machines prefer if the
|
2021-05-28 09:07:41 +00:00
|
|
|
* guest only uses the low 19 PID bits.
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
*/
|
2021-11-29 03:09:15 +00:00
|
|
|
mmu_pid_bits = 19;
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
}
|
2021-05-28 09:07:41 +00:00
|
|
|
mmu_base_pid = 1;
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
|
2016-04-29 13:25:58 +00:00
|
|
|
/*
|
|
|
|
* Allocate Partition table and process table for the
|
|
|
|
* host.
|
|
|
|
*/
|
powerpc/mm/radix: Workaround prefetch issue with KVM
2017-07-24 04:26:06 +00:00
|
|
|
BUG_ON(PRTB_SIZE_SHIFT > 36);
|
2018-02-13 15:08:24 +00:00
|
|
|
process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
|
2016-04-29 13:25:58 +00:00
|
|
|
/*
|
|
|
|
* Fill in the process table.
|
|
|
|
*/
|
2016-06-17 06:10:36 +00:00
|
|
|
rts_field = radix__get_tree_size();
|
2016-04-29 13:25:58 +00:00
|
|
|
process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
|
2019-09-02 15:29:26 +00:00
|
|
|
|
powerpc/64s/radix: Boot-time NULL pointer protection using a guard-PID
This change restores and formalises the behaviour that access to NULL
or other user addresses by the kernel during boot should fault rather
than succeed and modify memory. This was inadvertently broken when
fixing another bug, because it was previously not well defined and
only worked by chance.
powerpc/64s/radix uses high address bits to select an address space
"quadrant", which determines which PID and LPID are used to translate
the rest of the address (effective PID, effective LPID). The kernel
mapping at 0xC... selects quadrant 3, which uses PID=0 and LPID=0. So
the kernel page tables are installed in the PID 0 process table entry.
An address at 0x0... selects quadrant 0, which uses PID=PIDR for
translating the rest of the address (that is, it uses the value of the
PIDR register as the effective PID). If PIDR=0, then the translation
is performed with the PID 0 process table entry page tables. This is
the kernel mapping, so we effectively get another copy of the kernel
address space at 0. A NULL pointer access will access physical memory
address 0.
To prevent duplicating the kernel address space in quadrant 0, this
patch allocates a guard PID containing no translations, and
initializes PIDR with this during boot, before the MMU is switched on.
Any kernel access to quadrant 0 will use this guard PID for
translation and find no valid mappings, and therefore fault.
After boot, this PID will be switched away to user context PIDs, but
those contain user mappings (and usually NULL pointer protection)
rather than the kernel mapping, which is much safer (and by design). In
future this may be tightened further, and the guard PID could be used
for that.
Commit 371b8044 ("powerpc/64s: Initialize ISAv3 MMU registers before
setting partition table") introduced this problem because it zeroes
PIDR at boot. However previously the value was inherited from firmware
or kexec, which is not robust and can be zero (e.g., mambo).
Fixes: 371b80447ff3 ("powerpc/64s: Initialize ISAv3 MMU registers before setting partition table")
Cc: stable@vger.kernel.org # v4.15+
Reported-by: Florian Weimer <fweimer@redhat.com>
Tested-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-02-07 01:20:02 +00:00
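The quadrant selection described above comes straight from the top
effective-address bits; a small helper, added here purely for illustration
(it is not part of this file), makes that concrete:

/*
 * Illustration only: the top two bits of the effective address select the
 * quadrant, so 0xC000... (0b11) is quadrant 3 (kernel, PID 0/LPID 0) and
 * 0x0000... (0b00) is quadrant 0, translated with PIDR as the effective PID.
 */
static inline unsigned int ea_to_quadrant(unsigned long ea)
{
	return ea >> 62;
}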
|
|
|
/*
|
|
|
|
* The init_mm context is given the first available (non-zero) PID,
|
|
|
|
* which is the "guard PID" and contains no page table. PIDR should
|
|
|
|
* never be set to zero because that duplicates the kernel address
|
|
|
|
* space at the 0x0... offset (quadrant 0)!
|
|
|
|
*
|
|
|
|
* An arbitrary PID that may later be allocated by the PID allocator
|
|
|
|
* for userspace processes must not be used either, because that
|
|
|
|
* would cause stale user mappings for that PID on CPUs outside of
|
|
|
|
* the TLB invalidation scheme (because it won't be in mm_cpumask).
|
|
|
|
*
|
|
|
|
* So permanently carve out one PID for the purpose of a guard PID.
|
|
|
|
*/
|
|
|
|
init_mm.context.id = mmu_base_pid;
|
|
|
|
mmu_base_pid++;
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __init radix_init_partition_table(void)
|
|
|
|
{
|
2019-09-02 15:29:26 +00:00
|
|
|
unsigned long rts_field, dw0, dw1;
|
2016-06-17 06:10:36 +00:00
|
|
|
|
2016-11-21 05:00:58 +00:00
|
|
|
mmu_partition_table_init();
|
2016-06-17 06:10:36 +00:00
|
|
|
rts_field = radix__get_tree_size();
|
2016-11-21 05:00:58 +00:00
|
|
|
dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
|
2019-09-02 15:29:26 +00:00
|
|
|
dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
|
2019-09-02 15:29:30 +00:00
|
|
|
mmu_partition_table_set_entry(0, dw0, dw1, false);
|
2016-04-29 13:25:58 +00:00
|
|
|
|
2016-07-13 09:35:25 +00:00
|
|
|
pr_info("Initializing Radix MMU\n");
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int __init get_idx_from_shift(unsigned int shift)
|
|
|
|
{
|
|
|
|
int idx = -1;
|
|
|
|
|
|
|
|
switch (shift) {
|
|
|
|
case 0xc:
|
|
|
|
idx = MMU_PAGE_4K;
|
|
|
|
break;
|
|
|
|
case 0x10:
|
|
|
|
idx = MMU_PAGE_64K;
|
|
|
|
break;
|
|
|
|
case 0x15:
|
|
|
|
idx = MMU_PAGE_2M;
|
|
|
|
break;
|
|
|
|
case 0x1e:
|
|
|
|
idx = MMU_PAGE_1G;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return idx;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __init radix_dt_scan_page_sizes(unsigned long node,
|
|
|
|
const char *uname, int depth,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
int size = 0;
|
|
|
|
int shift, idx;
|
|
|
|
unsigned int ap;
|
|
|
|
const __be32 *prop;
|
|
|
|
const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
|
|
|
|
|
|
|
|
/* We are scanning "cpu" nodes only */
|
|
|
|
if (type == NULL || strcmp(type, "cpu") != 0)
|
|
|
|
return 0;
|
|
|
|
|
2017-07-24 04:26:06 +00:00
|
|
|
/* Grab page size encodings */
|
2016-04-29 13:25:58 +00:00
|
|
|
prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
|
|
|
|
if (!prop)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
pr_info("Page sizes from device-tree:\n");
|
|
|
|
for (; size >= 4; size -= 4, ++prop) {
|
|
|
|
|
|
|
|
struct mmu_psize_def *def;
|
|
|
|
|
|
|
|
/* top 3 bits are the AP encoding */
|
|
|
|
shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
|
|
|
|
ap = be32_to_cpu(prop[0]) >> 29;
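/*
 * Worked example (illustrative value): a property word of 0xa0000010
 * decodes to ap = 0xa0000010 >> 29 = 0x5 and
 * shift = 0xa0000010 & ~(0xe << 28) = 0x10, i.e. the 64K entry.
 */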
|
2016-11-05 04:24:22 +00:00
|
|
|
pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
|
2016-04-29 13:25:58 +00:00
|
|
|
|
|
|
|
idx = get_idx_from_shift(shift);
|
|
|
|
if (idx < 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
def = &mmu_psize_defs[idx];
|
|
|
|
def->shift = shift;
|
|
|
|
def->ap = ap;
|
2021-06-21 08:49:59 +00:00
|
|
|
def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* needed ? */
|
|
|
|
cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2016-07-26 11:55:27 +00:00
|
|
|
void __init radix__early_init_devtree(void)
|
2016-04-29 13:25:58 +00:00
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to find the available page sizes in the device-tree
|
|
|
|
*/
|
|
|
|
rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
|
2020-07-09 13:19:25 +00:00
|
|
|
if (!rc) {
|
|
|
|
/*
|
|
|
|
* No page size details found in device tree.
|
|
|
|
* Let's assume we have 4K and 64K page support
|
|
|
|
*/
|
|
|
|
mmu_psize_defs[MMU_PAGE_4K].shift = 12;
|
|
|
|
mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
|
2021-06-21 08:49:59 +00:00
|
|
|
mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
|
|
|
|
psize_to_rpti_pgsize(MMU_PAGE_4K);
|
2020-07-09 13:19:25 +00:00
|
|
|
|
|
|
|
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
|
|
|
|
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
|
2021-06-21 08:49:59 +00:00
|
|
|
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
|
|
|
|
psize_to_rpti_pgsize(MMU_PAGE_64K);
|
2020-07-09 13:19:25 +00:00
|
|
|
}
|
2016-04-29 13:25:58 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
void __init radix__early_init_mmu(void)
|
|
|
|
{
|
|
|
|
unsigned long lpcr;
|
|
|
|
|
2021-12-01 14:41:52 +00:00
|
|
|
#ifdef CONFIG_PPC_64S_HASH_MMU
|
2016-04-29 13:25:58 +00:00
|
|
|
#ifdef CONFIG_PPC_64K_PAGES
|
|
|
|
/* PAGE_SIZE mappings */
|
|
|
|
mmu_virtual_psize = MMU_PAGE_64K;
|
|
|
|
#else
|
|
|
|
mmu_virtual_psize = MMU_PAGE_4K;
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* initialize page table size
|
|
|
|
*/
|
|
|
|
__pte_index_size = RADIX_PTE_INDEX_SIZE;
|
|
|
|
__pmd_index_size = RADIX_PMD_INDEX_SIZE;
|
|
|
|
__pud_index_size = RADIX_PUD_INDEX_SIZE;
|
|
|
|
__pgd_index_size = RADIX_PGD_INDEX_SIZE;
|
2018-02-11 15:00:06 +00:00
|
|
|
__pud_cache_index = RADIX_PUD_INDEX_SIZE;
|
2016-04-29 13:25:58 +00:00
|
|
|
__pte_table_size = RADIX_PTE_TABLE_SIZE;
|
|
|
|
__pmd_table_size = RADIX_PMD_TABLE_SIZE;
|
|
|
|
__pud_table_size = RADIX_PUD_TABLE_SIZE;
|
|
|
|
__pgd_table_size = RADIX_PGD_TABLE_SIZE;
|
|
|
|
|
2016-04-29 13:26:19 +00:00
|
|
|
__pmd_val_bits = RADIX_PMD_VAL_BITS;
|
|
|
|
__pud_val_bits = RADIX_PUD_VAL_BITS;
|
|
|
|
__pgd_val_bits = RADIX_PGD_VAL_BITS;
|
2016-04-29 13:25:58 +00:00
|
|
|
|
2016-04-29 13:26:21 +00:00
|
|
|
__kernel_virt_start = RADIX_KERN_VIRT_START;
|
|
|
|
__vmalloc_start = RADIX_VMALLOC_START;
|
|
|
|
__vmalloc_end = RADIX_VMALLOC_END;
|
2017-08-01 10:29:22 +00:00
|
|
|
__kernel_io_start = RADIX_KERN_IO_START;
|
2019-04-17 12:59:13 +00:00
|
|
|
__kernel_io_end = RADIX_KERN_IO_END;
|
2019-04-17 12:59:14 +00:00
|
|
|
vmemmap = (struct page *)RADIX_VMEMMAP_START;
|
2016-04-29 13:26:21 +00:00
|
|
|
ioremap_bot = IOREMAP_BASE;
|
2016-06-29 20:06:28 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_PCI
|
|
|
|
pci_io_base = ISA_IO_BASE;
|
|
|
|
#endif
|
2018-03-22 08:43:50 +00:00
|
|
|
__pte_frag_nr = RADIX_PTE_FRAG_NR;
|
|
|
|
__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
|
2018-04-16 11:27:22 +00:00
|
|
|
__pmd_frag_nr = RADIX_PMD_FRAG_NR;
|
|
|
|
__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
|
2016-04-29 13:26:21 +00:00
|
|
|
|
2019-09-02 15:29:26 +00:00
|
|
|
radix_init_pgtable();
|
|
|
|
|
2016-05-31 06:26:29 +00:00
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
|
|
|
|
lpcr = mfspr(SPRN_LPCR);
|
2016-07-13 09:35:21 +00:00
|
|
|
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
|
2016-04-29 13:25:58 +00:00
|
|
|
radix_init_partition_table();
|
2017-01-30 10:21:36 +00:00
|
|
|
} else {
|
|
|
|
radix_init_pseries();
|
2016-05-31 06:26:29 +00:00
|
|
|
}
|
2016-04-29 13:25:58 +00:00
|
|
|
|
2016-11-21 05:00:58 +00:00
|
|
|
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
|
|
|
|
|
2018-02-07 01:20:02 +00:00
|
|
|
/* Switch to the guard PID before turning on MMU */
|
|
|
|
radix__switch_mmu_context(NULL, &init_mm);
|
2019-09-02 15:29:29 +00:00
|
|
|
tlbiel_all();
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void radix__early_init_mmu_secondary(void)
|
|
|
|
{
|
|
|
|
unsigned long lpcr;
|
|
|
|
/*
|
2016-05-31 06:26:29 +00:00
|
|
|
* update partition table control register and UPRT
|
2016-04-29 13:25:58 +00:00
|
|
|
*/
|
2016-05-31 06:26:29 +00:00
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
|
|
|
|
lpcr = mfspr(SPRN_LPCR);
|
2016-07-13 09:35:21 +00:00
|
|
|
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
|
2016-05-31 06:26:29 +00:00
|
|
|
|
2019-08-22 03:48:36 +00:00
|
|
|
set_ptcr_when_no_uv(__pa(partition_tb) |
|
|
|
|
(PATB_SIZE_SHIFT - 12));
|
2016-05-31 06:26:29 +00:00
|
|
|
}
|
powerpc/64s: Improve local TLB flush for boot and MCE on POWER9
There are several cases outside the normal address space management
where a CPU's entire local TLB is to be flushed:
1. Booting the kernel, in case something has left stale entries in
the TLB (e.g., kexec).
2. Machine check, to clean corrupted TLB entries.
One other place where the TLB is flushed is waking from deep idle
states. The flush is a side-effect of calling ->cpu_restore with the
intention of re-setting various SPRs. The flush itself is unnecessary
because in the first case, the TLB should not acquire new corrupted
TLB entries as part of sleep/wake (though they may be lost).
This type of TLB flush is coded inflexibly, several times for each CPU
type, and they have a number of problems with ISA v3.0B:
- The current radix mode of the MMU is not taken into account; it is
always done as a hash flush. For IS=2 (LPID-matching flush from host)
and IS=3 with HV=0 (guest kernel flush), tlbie(l) is undefined if
the R field does not match the current radix mode.
- ISA v3.0B hash must flush the partition and process table caches as
well.
- ISA v3.0B radix must flush partition and process scoped translations,
partition and process table caches, and also the page walk cache.
So consolidate the flushing code and implement it in C and inline asm
under the mm/ directory with the rest of the flush code. Add ISA v3.0B
cases for radix and hash, and use the radix flush in radix environment.
Provide a way for IS=2 (LPID flush) to specify the radix mode of the
partition. Have KVM pass in the radix mode of the guest.
Take out the flushes from early cputable/dt_cpu_ftrs detection hooks,
and move them later in the boot process, after the MMU registers are set
up and before relocation is first turned on.
The TLB flush is no longer called when restoring from deep idle states.
This could not be done as a separate step because booting secondaries
uses the same cpu_restore as idle restore, which needs the TLB flush.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-12-23 15:15:50 +00:00
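A schematic sketch of the consolidated flush described above (the function,
the set-count parameter and the loop body are assumptions for illustration;
the real helpers issue tlbiel with the ISA v3.0B RIC/PRS/R fields chosen for
the current MMU mode):

/*
 * Schematic sketch only: invalidate every congruence class (set) of the
 * local TLB, which is what the consolidated boot/MCE flush helpers do.
 */
static void sketch_local_tlb_flush_all(unsigned int num_sets)
{
	unsigned int set;

	asm volatile("ptesync" : : : "memory");
	for (set = 0; set < num_sets; set++) {
		/*
		 * The real code issues tlbiel here, with the RB operand
		 * built from 'set' and RIC/PRS/R chosen for radix or hash.
		 */
	}
	asm volatile("ptesync" : : : "memory");
}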
|
|
|
|
2018-02-07 01:20:02 +00:00
|
|
|
radix__switch_mmu_context(NULL, &init_mm);
|
2019-09-02 15:29:29 +00:00
|
|
|
tlbiel_all();
|
2020-11-27 04:44:06 +00:00
|
|
|
|
|
|
|
/* Make sure userspace can't change the AMR */
|
|
|
|
mtspr(SPRN_UAMOR, 0);
|
2016-04-29 13:25:58 +00:00
|
|
|
}
|
|
|
|
|
2021-07-14 12:47:58 +00:00
|
|
|
/* Called during kexec sequence with MMU off */
|
|
|
|
notrace void radix__mmu_cleanup_all(void)
|
2016-08-19 08:52:37 +00:00
|
|
|
{
|
|
|
|
unsigned long lpcr;
|
|
|
|
|
|
|
|
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
|
|
|
|
lpcr = mfspr(SPRN_LPCR);
|
|
|
|
mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
|
2019-08-22 03:48:36 +00:00
|
|
|
set_ptcr_when_no_uv(0);
|
2016-12-14 02:36:51 +00:00
|
|
|
powernv_set_nmmu_ptcr(0);
|
2016-08-19 08:52:37 +00:00
|
|
|
radix__flush_tlb_all();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-01-16 19:07:44 +00:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2017-01-16 19:07:45 +00:00
|
|
|
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
|
|
|
|
{
|
|
|
|
pte_t *pte;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < PTRS_PER_PTE; i++) {
|
|
|
|
pte = pte_start + i;
|
|
|
|
if (!pte_none(*pte))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pte_free_kernel(&init_mm, pte_start);
|
|
|
|
pmd_clear(pmd);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
|
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < PTRS_PER_PMD; i++) {
|
|
|
|
pmd = pmd_start + i;
|
|
|
|
if (!pmd_none(*pmd))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pmd_free(&init_mm, pmd_start);
|
|
|
|
pud_clear(pud);
|
|
|
|
}
|
|
|
|
|
2020-07-09 13:19:23 +00:00
|
|
|
static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
|
|
|
|
{
|
|
|
|
pud_t *pud;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < PTRS_PER_PUD; i++) {
|
|
|
|
pud = pud_start + i;
|
|
|
|
if (!pud_none(*pud))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pud_free(&init_mm, pud_start);
|
|
|
|
p4d_clear(p4d);
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:56 +00:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
|
|
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
|
|
|
|
|
|
|
|
return !vmemmap_populated(start, PMD_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
|
|
|
|
{
|
|
|
|
unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
|
|
|
|
|
|
|
|
return !vmemmap_populated(start, PAGE_SIZE);
|
|
|
|
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void __meminit free_vmemmap_pages(struct page *page,
|
|
|
|
struct vmem_altmap *altmap,
|
|
|
|
int order)
|
|
|
|
{
|
|
|
|
unsigned int nr_pages = 1 << order;
|
|
|
|
|
|
|
|
if (altmap) {
|
|
|
|
unsigned long alt_start, alt_end;
|
|
|
|
unsigned long base_pfn = page_to_pfn(page);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* With a 2M vmemmap mapping we can have things set up
|
|
|
|
* such that even though an altmap is specified we never
|
|
|
|
* use the altmap.
|
|
|
|
*/
|
|
|
|
alt_start = altmap->base_pfn;
|
|
|
|
alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
|
|
|
|
|
|
|
|
if (base_pfn >= alt_start && base_pfn < alt_end) {
|
|
|
|
vmem_altmap_free(altmap, nr_pages);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PageReserved(page)) {
|
|
|
|
/* allocated from memblock */
|
|
|
|
while (nr_pages--)
|
|
|
|
free_reserved_page(page++);
|
|
|
|
} else
|
|
|
|
free_pages((unsigned long)page_address(page), order);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
|
|
|
|
unsigned long end, bool direct,
|
|
|
|
struct vmem_altmap *altmap)
|
2017-01-16 19:07:45 +00:00
|
|
|
{
|
2023-06-16 11:08:13 +00:00
|
|
|
unsigned long next, pages = 0;
|
2017-01-16 19:07:45 +00:00
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
pte = pte_start + pte_index(addr);
|
|
|
|
for (; addr < end; addr = next, pte++) {
|
|
|
|
next = (addr + PAGE_SIZE) & PAGE_MASK;
|
|
|
|
if (next > end)
|
|
|
|
next = end;
|
|
|
|
|
|
|
|
if (!pte_present(*pte))
|
|
|
|
continue;
|
|
|
|
|
2023-07-24 19:07:56 +00:00
|
|
|
if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
|
|
|
|
if (!direct)
|
|
|
|
free_vmemmap_pages(pte_page(*pte), altmap, 0);
|
|
|
|
pte_clear(&init_mm, addr, pte);
|
|
|
|
pages++;
|
2017-01-16 19:07:46 +00:00
|
|
|
}
|
2023-07-24 19:07:56 +00:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
|
|
else if (!direct && vmemmap_page_is_unused(addr, next)) {
|
|
|
|
free_vmemmap_pages(pte_page(*pte), altmap, 0);
|
|
|
|
pte_clear(&init_mm, addr, pte);
|
|
|
|
}
|
|
|
|
#endif
|
2017-01-16 19:07:45 +00:00
|
|
|
}
|
2023-06-16 11:08:13 +00:00
|
|
|
if (direct)
|
|
|
|
update_page_count(mmu_virtual_psize, -pages);
|
2017-01-16 19:07:45 +00:00
|
|
|
}
|
|
|
|
|
2020-07-29 13:37:41 +00:00
|
|
|
static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
|
2023-07-24 19:07:56 +00:00
|
|
|
unsigned long end, bool direct,
|
|
|
|
struct vmem_altmap *altmap)
|
2017-01-16 19:07:45 +00:00
|
|
|
{
|
2023-06-16 11:08:13 +00:00
|
|
|
unsigned long next, pages = 0;
|
2017-01-16 19:07:45 +00:00
|
|
|
pte_t *pte_base;
|
|
|
|
pmd_t *pmd;
|
|
|
|
|
|
|
|
pmd = pmd_start + pmd_index(addr);
|
|
|
|
for (; addr < end; addr = next, pmd++) {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
|
|
|
|
if (!pmd_present(*pmd))
|
|
|
|
continue;
|
|
|
|
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pmd_leaf(*pmd)) {
|
2023-07-24 19:07:56 +00:00
|
|
|
if (IS_ALIGNED(addr, PMD_SIZE) &&
|
|
|
|
IS_ALIGNED(next, PMD_SIZE)) {
|
|
|
|
if (!direct)
|
|
|
|
free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
|
|
|
|
pte_clear(&init_mm, addr, (pte_t *)pmd);
|
|
|
|
pages++;
|
2020-07-09 13:19:24 +00:00
|
|
|
}
|
2023-07-24 19:07:56 +00:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
|
|
else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
|
|
|
|
free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
|
|
|
|
pte_clear(&init_mm, addr, (pte_t *)pmd);
|
|
|
|
}
|
|
|
|
#endif
|
2017-01-16 19:07:45 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
|
2023-07-24 19:07:56 +00:00
|
|
|
remove_pte_table(pte_base, addr, next, direct, altmap);
|
2017-01-16 19:07:45 +00:00
|
|
|
free_pte_table(pte_base, pmd);
|
|
|
|
}
|
2023-06-16 11:08:13 +00:00
|
|
|
if (direct)
|
|
|
|
update_page_count(MMU_PAGE_2M, -pages);
|
2017-01-16 19:07:45 +00:00
|
|
|
}
|
|
|
|
|
2020-07-29 13:37:41 +00:00
|
|
|
static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
|
2023-07-24 19:07:56 +00:00
|
|
|
unsigned long end, bool direct,
|
|
|
|
struct vmem_altmap *altmap)
|
2017-01-16 19:07:45 +00:00
|
|
|
{
|
2023-06-16 11:08:13 +00:00
|
|
|
unsigned long next, pages = 0;
|
2017-01-16 19:07:45 +00:00
|
|
|
pmd_t *pmd_base;
|
|
|
|
pud_t *pud;
|
|
|
|
|
|
|
|
pud = pud_start + pud_index(addr);
|
|
|
|
for (; addr < end; addr = next, pud++) {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
|
|
|
|
if (!pud_present(*pud))
|
|
|
|
continue;
|
|
|
|
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pud_leaf(*pud)) {
|
2020-07-09 13:19:24 +00:00
|
|
|
if (!IS_ALIGNED(addr, PUD_SIZE) ||
|
|
|
|
!IS_ALIGNED(next, PUD_SIZE)) {
|
|
|
|
WARN_ONCE(1, "%s: unaligned range\n", __func__);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
pte_clear(&init_mm, addr, (pte_t *)pud);
|
2023-06-16 11:08:13 +00:00
|
|
|
pages++;
|
2017-01-16 19:07:45 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-07-08 01:09:53 +00:00
|
|
|
pmd_base = pud_pgtable(*pud);
|
2023-07-24 19:07:56 +00:00
|
|
|
remove_pmd_table(pmd_base, addr, next, direct, altmap);
|
2017-01-16 19:07:45 +00:00
|
|
|
free_pmd_table(pmd_base, pud);
|
|
|
|
}
|
2023-06-16 11:08:13 +00:00
|
|
|
if (direct)
|
|
|
|
update_page_count(MMU_PAGE_1G, -pages);
|
2017-01-16 19:07:45 +00:00
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:56 +00:00
|
|
|
static void __meminit
|
|
|
|
remove_pagetable(unsigned long start, unsigned long end, bool direct,
|
|
|
|
struct vmem_altmap *altmap)
|
2017-01-16 19:07:45 +00:00
|
|
|
{
|
|
|
|
unsigned long addr, next;
|
|
|
|
pud_t *pud_base;
|
|
|
|
pgd_t *pgd;
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d_t *p4d;
|
2017-01-16 19:07:45 +00:00
|
|
|
|
|
|
|
spin_lock(&init_mm.page_table_lock);
|
|
|
|
|
|
|
|
for (addr = start; addr < end; addr = next) {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
|
|
|
|
pgd = pgd_offset_k(addr);
|
2020-06-04 23:46:44 +00:00
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
|
|
if (!p4d_present(*p4d))
|
2017-01-16 19:07:45 +00:00
|
|
|
continue;
|
|
|
|
|
2024-03-05 04:37:42 +00:00
|
|
|
if (p4d_leaf(*p4d)) {
|
2020-07-09 13:19:24 +00:00
|
|
|
if (!IS_ALIGNED(addr, P4D_SIZE) ||
|
|
|
|
!IS_ALIGNED(next, P4D_SIZE)) {
|
|
|
|
WARN_ONCE(1, "%s: unaligned range\n", __func__);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
pte_clear(&init_mm, addr, (pte_t *)pgd);
|
2017-01-16 19:07:45 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-07-08 01:09:56 +00:00
|
|
|
pud_base = p4d_pgtable(*p4d);
|
2023-07-24 19:07:56 +00:00
|
|
|
remove_pud_table(pud_base, addr, next, direct, altmap);
|
2020-07-09 13:19:23 +00:00
|
|
|
free_pud_table(pud_base, p4d);
|
2017-01-16 19:07:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
spin_unlock(&init_mm.page_table_lock);
|
|
|
|
radix__flush_tlb_kernel_range(start, end);
|
|
|
|
}
|
|
|
|
|
2020-04-10 21:33:32 +00:00
|
|
|
int __meminit radix__create_section_mapping(unsigned long start,
|
|
|
|
unsigned long end, int nid,
|
|
|
|
pgprot_t prot)
|
2017-01-16 19:07:44 +00:00
|
|
|
{
|
2019-04-17 12:59:15 +00:00
|
|
|
if (end >= RADIX_VMALLOC_START) {
|
2019-04-23 15:10:17 +00:00
|
|
|
pr_warn("Outside the supported range\n");
|
2019-04-17 12:59:15 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2020-07-09 13:19:25 +00:00
|
|
|
return create_physical_mapping(__pa(start), __pa(end),
|
2022-09-26 07:57:23 +00:00
|
|
|
nid, prot);
|
2017-01-16 19:07:44 +00:00
|
|
|
}
|
2017-01-16 19:07:45 +00:00
|
|
|
|
2018-03-09 20:45:58 +00:00
|
|
|
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
|
2017-01-16 19:07:45 +00:00
|
|
|
{
|
2023-07-24 19:07:56 +00:00
|
|
|
remove_pagetable(start, end, true, NULL);
|
2017-01-16 19:07:45 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2017-01-16 19:07:44 +00:00
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG */
|
|
|
|
|
2016-04-29 13:26:00 +00:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
2018-02-13 15:08:22 +00:00
|
|
|
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
|
|
|
|
pgprot_t flags, unsigned int map_page_size,
|
|
|
|
int nid)
|
|
|
|
{
|
|
|
|
return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
|
|
|
|
}
|
|
|
|
|
2016-04-29 13:26:00 +00:00
|
|
|
int __meminit radix__vmemmap_create_mapping(unsigned long start,
|
|
|
|
unsigned long page_size,
|
|
|
|
unsigned long phys)
|
|
|
|
{
|
|
|
|
/* Create a PTE encoding */
|
2018-02-13 15:08:24 +00:00
|
|
|
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
|
|
|
|
int ret;
|
|
|
|
|
2019-04-17 12:59:15 +00:00
|
|
|
if ((start + page_size) >= RADIX_VMEMMAP_END) {
|
2019-04-23 15:10:17 +00:00
|
|
|
pr_warn("Outside the supported range\n");
|
2019-04-17 12:59:15 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2023-06-16 11:08:14 +00:00
|
|
|
ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
|
2018-02-13 15:08:24 +00:00
|
|
|
BUG_ON(ret);
|
2016-04-29 13:26:00 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:57 +00:00
|
|
|
|
|
|
|
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
|
|
|
|
{
|
|
|
|
if (radix_enabled())
|
|
|
|
return __vmemmap_can_optimize(altmap, pgmap);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:56 +00:00
|
|
|
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
|
|
|
|
unsigned long addr, unsigned long next)
|
|
|
|
{
|
2024-03-05 04:37:47 +00:00
|
|
|
int large = pmd_leaf(*pmdp);
|
2023-07-24 19:07:56 +00:00
|
|
|
|
|
|
|
if (large)
|
|
|
|
vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
|
|
|
|
|
|
|
|
return large;
|
|
|
|
}
|
|
|
|
|
|
|
|
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
|
|
|
|
unsigned long addr, unsigned long next)
|
|
|
|
{
|
|
|
|
pte_t entry;
|
|
|
|
pte_t *ptep = pmdp_ptep(pmdp);
|
|
|
|
|
|
|
|
VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
|
|
|
|
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
|
|
|
|
set_pte_at(&init_mm, addr, ptep, entry);
|
|
|
|
asm volatile("ptesync": : :"memory");
|
|
|
|
|
|
|
|
vmemmap_verify(ptep, node, addr, next);
|
|
|
|
}
|
|
|
|
|
|
|
|
static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
|
|
|
|
int node,
|
|
|
|
struct vmem_altmap *altmap,
|
|
|
|
struct page *reuse)
|
|
|
|
{
|
|
|
|
pte_t *pte = pte_offset_kernel(pmdp, addr);
|
|
|
|
|
|
|
|
if (pte_none(*pte)) {
|
|
|
|
pte_t entry;
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
if (!reuse) {
|
|
|
|
/*
|
|
|
|
* make sure we don't create altmap mappings
|
|
|
|
* covering things outside the device.
|
|
|
|
*/
|
|
|
|
if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
|
|
|
|
altmap = NULL;
|
|
|
|
|
|
|
|
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
|
|
|
|
if (!p && altmap)
|
|
|
|
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
|
|
|
|
if (!p)
|
|
|
|
return NULL;
|
2023-07-24 19:07:59 +00:00
|
|
|
pr_debug("PAGE_SIZE vmemmap mapping\n");
|
2023-07-24 19:07:56 +00:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* When a PTE/PMD entry is freed from the init_mm
|
|
|
|
* there's a free_pages() call to this page allocated
|
|
|
|
* above. Thus this get_page() is paired with the
|
|
|
|
* put_page_testzero() on the freeing path.
|
|
|
|
* This can only be called by certain ZONE_DEVICE paths,
|
|
|
|
* and through vmemmap_populate_compound_pages() when
|
|
|
|
* slab is available.
|
|
|
|
*/
|
|
|
|
get_page(reuse);
|
|
|
|
p = page_to_virt(reuse);
|
2023-07-24 19:07:59 +00:00
|
|
|
pr_debug("Tail page reuse vmemmap mapping\n");
|
2023-07-24 19:07:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
VM_BUG_ON(!PAGE_ALIGNED(addr));
|
|
|
|
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
|
|
|
|
set_pte_at(&init_mm, addr, pte, entry);
|
|
|
|
asm volatile("ptesync": : :"memory");
|
|
|
|
}
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
pud_t *pud;
|
|
|
|
|
|
|
|
/* Keep it simple: do all early vmemmap mappings at PAGE_SIZE */
|
|
|
|
if (unlikely(p4d_none(*p4dp))) {
|
|
|
|
if (unlikely(!slab_is_available())) {
|
|
|
|
pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
|
|
|
|
p4d_populate(&init_mm, p4dp, pud);
|
|
|
|
/* go to the pud_offset */
|
|
|
|
} else
|
|
|
|
return pud_alloc(&init_mm, p4dp, address);
|
|
|
|
}
|
|
|
|
return pud_offset(p4dp, address);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
|
|
|
|
/* Keep it simple: do all early vmemmap mappings at PAGE_SIZE */
|
|
|
|
if (unlikely(pud_none(*pudp))) {
|
|
|
|
if (unlikely(!slab_is_available())) {
|
|
|
|
pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
|
|
|
|
pud_populate(&init_mm, pudp, pmd);
|
|
|
|
} else
|
|
|
|
return pmd_alloc(&init_mm, pudp, address);
|
|
|
|
}
|
|
|
|
return pmd_offset(pudp, address);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
|
|
|
|
unsigned long address)
|
|
|
|
{
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
/* Keep it simple: do all early vmemmap mappings at PAGE_SIZE */
|
|
|
|
if (unlikely(pmd_none(*pmdp))) {
|
|
|
|
if (unlikely(!slab_is_available())) {
|
|
|
|
pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
|
|
|
|
pmd_populate(&init_mm, pmdp, pte);
|
|
|
|
} else
|
|
|
|
return pte_alloc_kernel(pmdp, address);
|
|
|
|
}
|
|
|
|
return pte_offset_kernel(pmdp, address);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
|
|
|
|
struct vmem_altmap *altmap)
|
|
|
|
{
|
|
|
|
unsigned long addr;
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgd;
|
|
|
|
p4d_t *p4d;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
for (addr = start; addr < end; addr = next) {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
|
|
|
|
pgd = pgd_offset_k(addr);
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
|
|
pud = vmemmap_pud_alloc(p4d, node, addr);
|
|
|
|
if (!pud)
|
|
|
|
return -ENOMEM;
|
|
|
|
pmd = vmemmap_pmd_alloc(pud, node, addr);
|
|
|
|
if (!pmd)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (pmd_none(READ_ONCE(*pmd))) {
|
|
|
|
void *p;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* keep it simple by checking addr PMD_SIZE alignment
|
|
|
|
* and verifying the device boundary condition.
|
|
|
|
* For us to use a pmd mapping, both addr and pfn should
|
|
|
|
* be aligned. We skip if addr is not aligned and for
|
|
|
|
* pfn we hope we have extra area in the altmap that
|
|
|
|
* can help to find an aligned block. This can result
|
|
|
|
* in altmap block allocation failures, in which case
|
|
|
|
* we fall back to RAM for vmemmap allocation.
|
|
|
|
*/
|
|
|
|
if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
|
|
|
|
altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
|
|
|
|
/*
|
|
|
|
* make sure we don't create altmap mappings
|
|
|
|
* covering things outside the device.
|
|
|
|
*/
|
|
|
|
goto base_mapping;
|
|
|
|
}
|
|
|
|
|
|
|
|
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
|
|
|
|
if (p) {
|
|
|
|
vmemmap_set_pmd(pmd, p, node, addr, next);
|
2023-07-24 19:07:59 +00:00
|
|
|
pr_debug("PMD_SIZE vmemmap mapping\n");
|
2023-07-24 19:07:56 +00:00
|
|
|
continue;
|
|
|
|
} else if (altmap) {
|
|
|
|
/*
|
|
|
|
* A vmemmap block allocation can fail due to
|
|
|
|
* alignment requirements when we try to align
|
|
|
|
* things aggressively, thereby running out of
|
|
|
|
* space. Try base mapping on failure.
|
|
|
|
*/
|
|
|
|
goto base_mapping;
|
|
|
|
}
|
|
|
|
} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
|
|
|
|
/*
|
|
|
|
* If a huge mapping exists due to an early call to
|
|
|
|
* vmemmap_populate, let's try to use that.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
base_mapping:
|
|
|
|
/*
|
|
|
|
* Not able to allocate higher-order memory to back the memmap,
|
|
|
|
* or we found a pointer to a pte page. Allocate base page
|
|
|
|
* size vmemmap
|
|
|
|
*/
|
|
|
|
pte = vmemmap_pte_alloc(pmd, node, addr);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
|
|
|
|
next = addr + PAGE_SIZE;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:57 +00:00
|
|
|
static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
|
|
|
|
struct vmem_altmap *altmap,
|
|
|
|
struct page *reuse)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
p4d_t *p4d;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
pgd = pgd_offset_k(addr);
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
|
|
pud = vmemmap_pud_alloc(p4d, node, addr);
|
|
|
|
if (!pud)
|
|
|
|
return NULL;
|
|
|
|
pmd = vmemmap_pmd_alloc(pud, node, addr);
|
|
|
|
if (!pmd)
|
|
|
|
return NULL;
|
|
|
|
if (pmd_leaf(*pmd))
|
|
|
|
/*
|
|
|
|
* The second page is mapped as a hugepage due to a nearby request.
|
|
|
|
* Force our mapping to page size without deduplication
|
|
|
|
*/
|
|
|
|
return NULL;
|
|
|
|
pte = vmemmap_pte_alloc(pmd, node, addr);
|
|
|
|
if (!pte)
|
|
|
|
return NULL;
|
|
|
|
radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
|
|
|
|
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
|
|
|
|
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
|
|
|
static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
|
|
|
|
unsigned long pfn_offset, int node)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
p4d_t *p4d;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
unsigned long map_addr;
|
|
|
|
|
|
|
|
/* the second vmemmap page which we use for duplication */
|
|
|
|
map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
|
|
|
|
pgd = pgd_offset_k(map_addr);
|
|
|
|
p4d = p4d_offset(pgd, map_addr);
|
|
|
|
pud = vmemmap_pud_alloc(p4d, node, map_addr);
|
|
|
|
if (!pud)
|
|
|
|
return NULL;
|
|
|
|
pmd = vmemmap_pmd_alloc(pud, node, map_addr);
|
|
|
|
if (!pmd)
|
|
|
|
return NULL;
|
|
|
|
if (pmd_leaf(*pmd))
|
|
|
|
/*
|
|
|
|
* The second page is mapped as a hugepage due to a nearby request.
|
|
|
|
* Force our mapping to page size without deduplication
|
|
|
|
*/
|
|
|
|
return NULL;
|
|
|
|
pte = vmemmap_pte_alloc(pmd, node, map_addr);
|
|
|
|
if (!pte)
|
|
|
|
return NULL;
|
|
|
|
/*
|
|
|
|
* Check if there exists a mapping to the left
|
|
|
|
*/
|
|
|
|
if (pte_none(*pte)) {
|
|
|
|
/*
|
|
|
|
* Populate the head page vmemmap page.
|
|
|
|
* It can fall in a different pmd, hence
|
|
|
|
* vmemmap_populate_address()
|
|
|
|
*/
|
|
|
|
pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return NULL;
|
|
|
|
/*
|
|
|
|
* Populate the tail pages vmemmap page
|
|
|
|
*/
|
|
|
|
pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return NULL;
|
|
|
|
vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
|
|
|
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
|
|
|
|
unsigned long start,
|
|
|
|
unsigned long end, int node,
|
|
|
|
struct dev_pagemap *pgmap)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We want to map things as base page size mappings so that
|
|
|
|
* we can save space in vmemmap. We could have a huge mapping
|
|
|
|
* covering both edges.
|
|
|
|
*/
|
|
|
|
unsigned long addr;
|
|
|
|
unsigned long addr_pfn = start_pfn;
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgd;
|
|
|
|
p4d_t *p4d;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
for (addr = start; addr < end; addr = next) {
|
|
|
|
|
|
|
|
pgd = pgd_offset_k(addr);
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
|
|
pud = vmemmap_pud_alloc(p4d, node, addr);
|
|
|
|
if (!pud)
|
|
|
|
return -ENOMEM;
|
|
|
|
pmd = vmemmap_pmd_alloc(pud, node, addr);
|
|
|
|
if (!pmd)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (pmd_leaf(READ_ONCE(*pmd))) {
|
|
|
|
/* existing huge mapping. Skip the range */
|
|
|
|
addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
pte = vmemmap_pte_alloc(pmd, node, addr);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
if (!pte_none(*pte)) {
|
|
|
|
/*
|
|
|
|
* This could be because we already have a compound
|
|
|
|
* page whose VMEMMAP_RESERVE_NR pages were mapped and
|
|
|
|
* this request falls within those pages.
|
|
|
|
*/
|
|
|
|
addr_pfn += 1;
|
|
|
|
next = addr + PAGE_SIZE;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
|
|
|
|
unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
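/*
 * Worked example (illustrative numbers): with nr_pages = 16 and
 * addr_pfn = 35, ALIGN_DOWN(35, 16) = 32, so pfn_offset = 3 and this
 * pfn maps a tail page; pfn_offset == 0 below marks the head page.
 */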
|
|
|
|
pte_t *tail_page_pte;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the address is aligned to the huge page size, it is the
|
|
|
|
* head mapping.
|
|
|
|
*/
|
|
|
|
if (pfn_offset == 0) {
|
|
|
|
/* Populate the head page vmemmap page */
|
|
|
|
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Populate the tail pages vmemmap page
|
|
|
|
* It can fall in a different pmd, hence
|
|
|
|
* vmemmap_populate_address()
|
|
|
|
*/
|
|
|
|
pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
addr_pfn += 2;
|
|
|
|
next = addr + 2 * PAGE_SIZE;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* get the 2nd mapping details
|
|
|
|
* Also create it if that doesn't exist
|
|
|
|
*/
|
|
|
|
tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
|
|
|
|
if (!tail_page_pte) {
|
|
|
|
|
|
|
|
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
|
|
|
|
|
|
|
|
addr_pfn += 1;
|
|
|
|
next = addr + PAGE_SIZE;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
|
|
|
|
if (!pte)
|
|
|
|
return -ENOMEM;
|
|
|
|
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
|
|
|
|
|
|
|
|
addr_pfn += 1;
|
|
|
|
next = addr + PAGE_SIZE;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-04-29 13:26:00 +00:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2018-03-09 20:45:58 +00:00
|
|
|
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
|
2016-04-29 13:26:00 +00:00
|
|
|
{
|
2023-07-24 19:07:56 +00:00
|
|
|
remove_pagetable(start, start + page_size, true, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
|
|
|
|
struct vmem_altmap *altmap)
|
|
|
|
{
|
|
|
|
remove_pagetable(start, end, false, altmap);
|
2016-04-29 13:26:00 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif
|
2016-04-29 13:26:30 +00:00
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
|
|
|
|
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pmd_t *pmdp, unsigned long clr,
|
|
|
|
unsigned long set)
|
|
|
|
{
|
|
|
|
unsigned long old;
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2017-06-28 01:32:34 +00:00
|
|
|
WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
|
2018-04-16 11:27:16 +00:00
|
|
|
assert_spin_locked(pmd_lockptr(mm, pmdp));
|
2016-04-29 13:26:30 +00:00
|
|
|
#endif
|
|
|
|
|
2023-06-16 11:08:11 +00:00
|
|
|
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
|
2023-07-24 19:07:54 +00:00
|
|
|
trace_hugepage_update_pmd(addr, old, clr, set);
|
2016-04-29 13:26:30 +00:00
|
|
|
|
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:55 +00:00
|
|
|
unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pud_t *pudp, unsigned long clr,
|
|
|
|
unsigned long set)
|
|
|
|
{
|
|
|
|
unsigned long old;
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
WARN_ON(!pud_devmap(*pudp));
|
|
|
|
assert_spin_locked(pud_lockptr(mm, pudp));
|
|
|
|
#endif
|
|
|
|
|
|
|
|
old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
|
|
|
|
trace_hugepage_update_pud(addr, old, clr, set);
|
|
|
|
|
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
2016-04-29 13:26:30 +00:00
|
|
|
pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
|
|
|
|
pmd_t *pmdp)
|
|
|
|
|
|
|
|
{
|
|
|
|
pmd_t pmd;
|
|
|
|
|
|
|
|
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
|
|
|
|
VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
|
2017-06-28 01:32:34 +00:00
|
|
|
VM_BUG_ON(pmd_devmap(*pmdp));
|
2016-04-29 13:26:30 +00:00
|
|
|
/*
|
|
|
|
* khugepaged calls this for normal pmd
|
|
|
|
*/
|
|
|
|
pmd = *pmdp;
|
|
|
|
pmd_clear(pmdp);
|
2017-07-19 04:49:06 +00:00
|
|
|
|
|
|
|
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
|
|
|
|
|
2016-04-29 13:26:30 +00:00
|
|
|
return pmd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For us pgtable_t is pte_t *. In order to save the deposited
|
|
|
|
* page table, we consider the allocated page table as a list
|
|
|
|
* head. On withdraw we need to make sure we zero out the used
|
|
|
|
* list_head memory area.
|
|
|
|
*/
|
|
|
|
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
|
|
|
|
pgtable_t pgtable)
|
|
|
|
{
|
2019-03-29 10:00:00 +00:00
|
|
|
struct list_head *lh = (struct list_head *) pgtable;
|
2016-04-29 13:26:30 +00:00
|
|
|
|
2019-03-29 10:00:00 +00:00
|
|
|
assert_spin_locked(pmd_lockptr(mm, pmdp));
|
2016-04-29 13:26:30 +00:00
|
|
|
|
2019-03-29 10:00:00 +00:00
|
|
|
/* FIFO */
|
|
|
|
if (!pmd_huge_pte(mm, pmdp))
|
|
|
|
INIT_LIST_HEAD(lh);
|
|
|
|
else
|
|
|
|
list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
|
|
|
|
pmd_huge_pte(mm, pmdp) = pgtable;
|
2016-04-29 13:26:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
|
|
|
|
{
|
2019-03-29 10:00:00 +00:00
|
|
|
pte_t *ptep;
|
|
|
|
pgtable_t pgtable;
|
|
|
|
struct list_head *lh;
|
2016-04-29 13:26:30 +00:00
|
|
|
|
2019-03-29 10:00:00 +00:00
|
|
|
assert_spin_locked(pmd_lockptr(mm, pmdp));
|
|
|
|
|
|
|
|
/* FIFO */
|
|
|
|
pgtable = pmd_huge_pte(mm, pmdp);
|
|
|
|
lh = (struct list_head *) pgtable;
|
|
|
|
if (list_empty(lh))
|
|
|
|
pmd_huge_pte(mm, pmdp) = NULL;
|
|
|
|
else {
|
|
|
|
pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
|
|
|
|
list_del(lh);
|
|
|
|
}
|
|
|
|
ptep = (pte_t *) pgtable;
|
|
|
|
*ptep = __pte(0);
|
|
|
|
ptep++;
|
|
|
|
*ptep = __pte(0);
|
|
|
|
return pgtable;
|
|
|
|
}
|
2016-04-29 13:26:30 +00:00
|
|
|
|
|
|
|
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
|
2019-03-29 10:00:00 +00:00
|
|
|
unsigned long addr, pmd_t *pmdp)
|
2016-04-29 13:26:30 +00:00
|
|
|
{
|
|
|
|
pmd_t old_pmd;
|
|
|
|
unsigned long old;
|
|
|
|
|
|
|
|
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
|
|
|
|
old_pmd = __pmd(old);
|
|
|
|
return old_pmd;
|
|
|
|
}
|
|
|
|
|
2023-07-24 19:07:55 +00:00
|
|
|
pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
|
|
|
|
unsigned long addr, pud_t *pudp)
|
|
|
|
{
|
|
|
|
pud_t old_pud;
|
|
|
|
unsigned long old;
|
|
|
|
|
|
|
|
old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
|
|
|
|
old_pud = __pud(old);
|
|
|
|
return old_pud;
|
|
|
|
}
|
|
|
|
|
2016-04-29 13:26:30 +00:00
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
2018-05-29 14:28:39 +00:00
|
|
|
|
2018-05-29 14:28:40 +00:00
|
|
|
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
|
|
|
|
pte_t entry, unsigned long address, int psize)
|
2018-05-29 14:28:39 +00:00
|
|
|
{
|
2018-05-29 14:28:40 +00:00
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
powerpc/64s/radix: Fix soft dirty tracking
It was reported that soft dirty tracking doesn't work when using the
Radix MMU.
The tracking is supposed to work by clearing the soft dirty bit for a
mapping and then write protecting the PTE. If/when the page is written
to, a page fault occurs and the soft dirty bit is added back via
pte_mkdirty(). For example in wp_page_reuse():
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
Unfortunately on radix _PAGE_SOFTDIRTY is being dropped by
radix__ptep_set_access_flags(), called from ptep_set_access_flags(),
meaning the soft dirty bit is not set even though the page has been
written to.
Fix it by adding _PAGE_SOFTDIRTY to the set of bits that are able to be
changed in radix__ptep_set_access_flags().
Fixes: b0b5e9b13047 ("powerpc/mm/radix: Add radix pte #defines")
Cc: stable@vger.kernel.org # v4.7+
Reported-by: Dan Horák <dan@danny.cz>
Link: https://lore.kernel.org/r/20230511095558.56663a50f86bdc4cd97700b7@danny.cz
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20230511114224.977423-1-mpe@ellerman.id.au
2023-05-11 11:42:24 +00:00
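A small userspace sketch of the tracking cycle the fix restores (illustrative
only; it relies on the documented /proc/PID/pagemap interface, where bit 55
of an entry is the soft-dirty bit, after the bits have been cleared via
/proc/PID/clear_refs):

/* Illustration: return 1 if the page backing 'addr' is marked soft-dirty. */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int page_soft_dirty(void *addr)
{
	uint64_t ent = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	if (pread(fd, &ent, sizeof(ent),
		  ((uintptr_t)addr / getpagesize()) * sizeof(ent)) != sizeof(ent))
		ent = 0;
	close(fd);
	return (ent >> 55) & 1;	/* bit 55: pte is soft-dirty */
}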
|
|
|
unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
|
|
|
|
_PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
|
2018-08-22 17:16:05 +00:00
|
|
|
|
|
|
|
unsigned long change = pte_val(entry) ^ pte_val(*ptep);
|
2018-05-29 14:28:41 +00:00
|
|
|
/*
|
2022-05-25 02:23:57 +00:00
|
|
|
* On POWER9, the NMMU is not able to relax PTE access permissions
|
|
|
|
* for a translation with a TLB. The PTE must be invalidated, TLB
|
|
|
|
* flushed before the new PTE is installed.
|
|
|
|
*
|
|
|
|
* This only needs to be done for radix, because hash translation does
|
|
|
|
* flush when updating the linux pte (and we don't support NMMU
|
|
|
|
* accelerators on HPT on POWER9 anyway XXX: do we?).
|
|
|
|
*
|
|
|
|
* POWER10 (and P9P) NMMU does behave as per ISA.
|
2018-05-29 14:28:41 +00:00
|
|
|
*/
|
2022-05-25 02:23:57 +00:00
|
|
|
if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
|
|
|
|
atomic_read(&mm->context.copros) > 0) {
|
2018-05-29 14:28:39 +00:00
|
|
|
unsigned long old_pte, new_pte;
|
|
|
|
|
2018-08-22 17:16:05 +00:00
|
|
|
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
|
2018-05-29 14:28:39 +00:00
|
|
|
new_pte = old_pte | set;
|
2018-05-29 14:28:41 +00:00
|
|
|
radix__flush_tlb_page_psize(mm, address, psize);
|
2018-08-22 17:16:05 +00:00
|
|
|
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
|
2018-05-29 14:28:41 +00:00
|
|
|
} else {
|
2018-05-29 14:28:39 +00:00
|
|
|
__radix_pte_update(ptep, 0, set);
|
2018-06-01 10:01:15 +00:00
|
|
|
/*
|
|
|
|
* Book3S does not require a TLB flush when relaxing access
|
2022-05-25 02:23:57 +00:00
|
|
|
* restrictions in an address space (modulo the POWER9 nest
|
|
|
|
* MMU issue above), because the MMU will reload the PTE after
|
|
|
|
* taking an access fault, as defined by the architecture. See
|
|
|
|
* "Setting a Reference or Change Bit or Upgrading Access
|
|
|
|
* Authority (PTE Subject to Atomic Hardware Updates)" in
|
|
|
|
* Power ISA Version 3.1B.
|
2018-06-01 10:01:15 +00:00
|
|
|
*/
|
2018-05-29 14:28:41 +00:00
|
|
|
}
|
2018-06-01 10:01:19 +00:00
|
|
|
/* See ptesync comment in radix__set_pte_at */
|
2018-05-29 14:28:39 +00:00
|
|
|
}
|
2019-03-05 23:46:33 +00:00
|
|
|
|
|
|
|
void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pte_t *ptep,
|
|
|
|
pte_t old_pte, pte_t pte)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
|
|
|
|
|
|
/*
|
2022-05-25 02:23:57 +00:00
|
|
|
* POWER9 NMMU must flush the TLB after clearing the PTE before
|
|
|
|
* installing a PTE with more relaxed access permissions, see
|
|
|
|
* radix__ptep_set_access_flags.
|
2019-03-05 23:46:33 +00:00
|
|
|
*/
|
2022-05-25 02:23:57 +00:00
|
|
|
if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
|
|
|
|
is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
|
2019-03-05 23:46:33 +00:00
|
|
|
(atomic_read(&mm->context.copros) > 0))
|
|
|
|
radix__flush_tlb_page(vma, addr);
|
|
|
|
|
|
|
|
set_pte_at(mm, addr, ptep, pte);
|
|
|
|
}
|
2019-06-10 03:08:17 +00:00
|
|
|
|
2019-06-10 03:08:18 +00:00
|
|
|
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pte_t *ptep = (pte_t *)pud;
|
|
|
|
pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
|
|
|
|
|
|
|
|
if (!radix_enabled())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pud_clear_huge(pud_t *pud)
|
|
|
|
{
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pud_leaf(*pud)) {
|
2019-06-10 03:08:18 +00:00
|
|
|
pud_clear(pud);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
|
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
int i;
|
|
|
|
|
2021-07-08 01:09:53 +00:00
|
|
|
pmd = pud_pgtable(*pud);
|
2019-06-10 03:08:18 +00:00
|
|
|
pud_clear(pud);
|
|
|
|
|
|
|
|
flush_tlb_kernel_range(addr, addr + PUD_SIZE);
|
|
|
|
|
|
|
|
for (i = 0; i < PTRS_PER_PMD; i++) {
|
|
|
|
if (!pmd_none(pmd[i])) {
|
|
|
|
pte_t *pte;
|
|
|
|
pte = (pte_t *)pmd_page_vaddr(pmd[i]);
|
|
|
|
|
|
|
|
pte_free_kernel(&init_mm, pte);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pmd_free(&init_mm, pmd);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pte_t *ptep = (pte_t *)pmd;
|
|
|
|
pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
|
|
|
|
|
|
|
|
if (!radix_enabled())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pmd_clear_huge(pmd_t *pmd)
|
|
|
|
{
|
2024-03-05 04:37:42 +00:00
|
|
|
if (pmd_leaf(*pmd)) {
|
2019-06-10 03:08:18 +00:00
|
|
|
pmd_clear(pmd);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
|
|
|
|
{
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
pte = (pte_t *)pmd_page_vaddr(*pmd);
|
|
|
|
pmd_clear(pmd);
|
|
|
|
|
|
|
|
flush_tlb_kernel_range(addr, addr + PMD_SIZE);
|
|
|
|
|
|
|
|
pte_free_kernel(&init_mm, pte);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|