2019-06-03 05:44:50 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
|
|
*/
|
|
|
|
#ifndef __ASM_PGTABLE_H
|
|
|
|
#define __ASM_PGTABLE_H
|
|
|
|
|
2015-07-10 16:24:28 +00:00
|
|
|
#include <asm/bug.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <asm/proc-fns.h>
|
|
|
|
|
|
|
|
#include <asm/memory.h>
|
2020-05-04 13:42:36 +00:00
|
|
|
#include <asm/mte.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
#include <asm/pgtable-hwdef.h>
|
arm64: Remove fixmap include fragility
The asm-generic fixmap.h depends on each architecture's fixmap.h to pull
in the definition of PAGE_KERNEL_RO, if this exists. In the absence of
this, FIXMAP_PAGE_RO will not be defined. In mm/early_ioremap.c the
definition of early_memremap_ro is predicated on FIXMAP_PAGE_RO being
defined.
Currently, the arm64 fixmap.h doesn't include pgtable.h for the
definition of PAGE_KERNEL_RO, and as a knock-on effect early_memremap_ro
is not always defined, leading to link-time failures when it is used.
This has been observed with defconfig on next-20160226.
Unfortunately, as pgtable.h includes fixmap.h, adding the include
introduces a circular dependency, which is just as fragile.
Instead, this patch factors out PAGE_KERNEL_RO and other prot
definitions into a new pgtable-prot header which can be included by both
pgtable.h and fixmap.h, avoiding the circular dependency, and ensuring
that early_memremap_ro is always defined where it is used.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-02-26 14:31:32 +00:00
|
|
|
#include <asm/pgtable-prot.h>
|
2018-10-29 09:25:58 +00:00
|
|
|
#include <asm/tlbflush.h>
|
2012-03-05 11:49:27 +00:00
|
|
|
|
|
|
|
/*
|
2016-03-30 14:46:00 +00:00
|
|
|
* VMALLOC range.
|
2014-07-16 16:42:43 +00:00
|
|
|
*
|
2016-02-16 12:52:40 +00:00
|
|
|
* VMALLOC_START: beginning of the kernel vmalloc space
|
2023-12-13 08:40:31 +00:00
|
|
|
* VMALLOC_END: extends to the available space below vmemmap
|
2012-03-05 11:49:27 +00:00
|
|
|
*/
|
2016-02-16 12:52:40 +00:00
|
|
|
#define VMALLOC_START (MODULES_END)
|
2023-12-13 08:40:31 +00:00
|
|
|
#if VA_BITS == VA_BITS_MIN
|
2023-12-13 08:40:27 +00:00
|
|
|
#define VMALLOC_END (VMEMMAP_START - SZ_8M)
|
2023-12-13 08:40:31 +00:00
|
|
|
#else
|
|
|
|
#define VMEMMAP_UNUSED_NPAGES ((_PAGE_OFFSET(vabits_actual) - PAGE_OFFSET) >> PAGE_SHIFT)
|
|
|
|
#define VMALLOC_END (VMEMMAP_START + VMEMMAP_UNUSED_NPAGES * sizeof(struct page) - SZ_8M)
|
|
|
|
#endif
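When VA_BITS exceeds what the hardware implements, the #else branch above reclaims the
leading part of the vmemmap array, the slots describing linear-map addresses that can
never exist, for vmalloc. A rough user-space model of the arithmetic; the 52/48-bit
split, 4 KB pages and a 64-byte struct page are assumptions for illustration, not
values read from this header:

#include <stdio.h>
#include <stdint.h>

/* Illustrative constants only: 52-bit configured VA, 48-bit implemented VA,
 * 4 KB pages, 64-byte struct page. The real values come from the kernel config. */
#define PAGE_SHIFT      12
#define PAGE_OFFSET_52  0xfff0000000000000ULL   /* _PAGE_OFFSET(52) */
#define PAGE_OFFSET_48  0xffff000000000000ULL   /* _PAGE_OFFSET(48) */
#define STRUCT_PAGE_SZ  64ULL

int main(void)
{
	/* Linear-map VA span that a 48-bit CPU can never use. */
	uint64_t unused_va = PAGE_OFFSET_48 - PAGE_OFFSET_52;
	/* Matching number of never-used struct page slots at the start of vmemmap. */
	uint64_t unused_npages = unused_va >> PAGE_SHIFT;
	/* Bytes of vmemmap that vmalloc may therefore grow into. */
	uint64_t reclaimable = unused_npages * STRUCT_PAGE_SZ;

	printf("unused vmemmap prefix: %llu GiB\n",
	       (unsigned long long)(reclaimable >> 30));
	return 0;
}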
|
2012-03-05 11:49:27 +00:00
|
|
|
|
arm64: mm: use single quantity to represent the PA to VA translation
On arm64, the global variable memstart_addr represents the physical
address of PAGE_OFFSET, and so physical to virtual translations or
vice versa used to come down to simple additions or subtractions
involving the values of PAGE_OFFSET and memstart_addr.
When support for 52-bit virtual addressing was introduced, we had to
deal with PAGE_OFFSET potentially being outside of the region that
can be covered by the virtual range (as the 52-bit VA capable build
needs to be able to run on systems that are only 48-bit VA capable),
and for this reason, another translation was introduced, and recorded
in the global variable physvirt_offset.
However, if we go back to the original definition of memstart_addr,
i.e., the physical address of PAGE_OFFSET, it turns out that there is
no need for two separate translations: instead, we can simply subtract
the size of the unaddressable VA space from memstart_addr to make the
available physical memory appear in the 48-bit addressable VA region.
This simplifies things, but also fixes a bug on KASLR builds, which
may update memstart_addr later on in arm64_memblock_init(), but fails
to update vmemmap and physvirt_offset accordingly.
Fixes: 5383cc6efed1 ("arm64: mm: Introduce vabits_actual")
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Link: https://lore.kernel.org/r/20201008153602.9467-2-ardb@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
2020-10-08 15:35:59 +00:00
|
|
|
#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
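As the commit message above explains, biasing the vmemmap base by memstart_addr >>
PAGE_SHIFT lets a physical frame number index the struct page array directly even
though RAM rarely starts at physical address zero. A small user-space model of that
bias, with toy values standing in for the real constants:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

struct page { uint64_t flags; };             /* stand-in for the real struct page  */

#define PAGE_SHIFT 12
static struct page page_array[16];           /* pretend vmemmap backing store      */
static uint64_t memstart_addr = 0x8000000;   /* assumed PA of PAGE_OFFSET (128 MB) */

/* vmemmap points *before* the array so that vmemmap[pfn] works for pfn values
 * that start at memstart_addr >> PAGE_SHIFT rather than at zero.              */
#define vmemmap (page_array - (memstart_addr >> PAGE_SHIFT))

int main(void)
{
	uint64_t first_pfn = memstart_addr >> PAGE_SHIFT;

	/* The first page of RAM maps to slot 0 of the backing array. */
	assert(&vmemmap[first_pfn] == &page_array[0]);
	assert(&vmemmap[first_pfn + 3] == &page_array[3]);
	printf("pfn 0x%llx -> vmemmap slot 0\n", (unsigned long long)first_pfn);
	return 0;
}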
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
#ifndef __ASSEMBLY__
|
2015-07-10 16:24:28 +00:00
|
|
|
|
2017-06-26 13:27:36 +00:00
|
|
|
#include <asm/cmpxchg.h>
|
2016-01-25 11:45:07 +00:00
|
|
|
#include <asm/fixmap.h>
|
2015-07-10 16:24:28 +00:00
|
|
|
#include <linux/mmdebug.h>
|
2017-12-12 10:48:54 +00:00
|
|
|
#include <linux/mm_types.h>
|
|
|
|
#include <linux/sched.h>
|
2022-05-13 03:23:06 +00:00
|
|
|
#include <linux/page_table_check.h>
|
2015-07-10 16:24:28 +00:00
|
|
|
|
2020-06-25 08:03:14 +00:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
|
|
|
|
|
|
|
|
/* Set stride and tlb_level in flush_*_tlb_range */
|
|
|
|
#define flush_pmd_tlb_range(vma, addr, end) \
|
|
|
|
__flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
|
|
|
|
#define flush_pud_tlb_range(vma, addr, end) \
|
|
|
|
__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
arm64: enable THP_SWAP for arm64
THP_SWAP has been proven to improve the swap throughput significantly
on x86_64 according to commit bd4c82c22c367e ("mm, THP, swap: delay
splitting THP after swapped out").
As long as arm64 uses 4K page size, it is quite similar with x86_64
by having 2MB PMD THP. THP_SWAP is architecture-independent, thus,
enabling it on arm64 will benefit arm64 as well.
A corner case is that MTE has an assumption that only base pages
can be swapped. We won't enable THP_SWAP for ARM64 hardware with
MTE support until MTE is reworked to coexist with THP_SWAP.
A micro-benchmark is written to measure thp swapout throughput as
below,
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>

unsigned long long tv_to_ms(struct timeval tv)
{
	return tv.tv_sec * 1000 + tv.tv_usec / 1000;
}
int main(void)
{
	struct timeval tv_b, tv_e;
#define SIZE (400*1024*1024)
	volatile void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("fail to get memory");
		exit(-1);
	}
	madvise((void *)p, SIZE, MADV_HUGEPAGE);
	memset((void *)p, 0x11, SIZE); /* write to get mem */
	gettimeofday(&tv_b, NULL);
	madvise((void *)p, SIZE, MADV_PAGEOUT);
	gettimeofday(&tv_e, NULL);
	printf("swp out bandwidth: %llu bytes/ms\n",
	       SIZE/(tv_to_ms(tv_e) - tv_to_ms(tv_b)));
	return 0;
}
Testing is done on rk3568 64bit Quad Core Cortex-A55 platform -
ROCK 3A.
thp swp throughput w/o patch: 2734bytes/ms (mean of 10 tests)
thp swp throughput w/ patch: 3331bytes/ms (mean of 10 tests)
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Link: https://lore.kernel.org/r/20220720093737.133375-1-21cnbao@gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
2022-07-20 09:37:37 +00:00
|
|
|
static inline bool arch_thp_swp_supported(void)
|
|
|
|
{
|
|
|
|
return !system_supports_mte();
|
|
|
|
}
|
|
|
|
#define arch_thp_swp_supported arch_thp_swp_supported
|
|
|
|
|
2020-09-30 12:20:40 +00:00
|
|
|
/*
|
|
|
|
* Outside of a few very special situations (e.g. hibernation), we always
|
|
|
|
* use broadcast TLB invalidation instructions, therefore a spurious page
|
|
|
|
* fault on one CPU which has been handled concurrently by another CPU
|
|
|
|
* does not need to perform additional invalidation.
|
|
|
|
*/
|
2023-03-06 16:15:48 +00:00
|
|
|
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
|
2020-09-30 12:20:40 +00:00
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* ZERO_PAGE is a global shared page that is always zero: used
|
|
|
|
* for zero-mapped memory areas etc..
|
|
|
|
*/
|
arm64: mm: place empty_zero_page in bss
Currently the zero page is set up in paging_init, and thus we cannot use
the zero page earlier. We use the zero page as a reserved TTBR value
from which no TLB entries may be allocated (e.g. when uninstalling the
idmap). To enable such usage earlier (as may be required for invasive
changes to the kernel page tables), and to minimise the time that the
idmap is active, we need to be able to use the zero page before
paging_init.
This patch follows the example set by x86, by allocating the zero page
at compile time, in .bss. This means that the zero page itself is
available immediately upon entry to start_kernel (as we zero .bss before
this), and also means that the zero page takes up no space in the raw
Image binary. The associated struct page is allocated in bootmem_init,
and remains unavailable until this time.
Outside of arch code, the only users of empty_zero_page assume that the
empty_zero_page symbol refers to the zeroed memory itself, and that
ZERO_PAGE(x) must be used to acquire the associated struct page,
following the example of x86. This patch also brings arm64 inline with
these assumptions.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 11:44:57 +00:00
|
|
|
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
|
2017-01-10 21:35:49 +00:00
|
|
|
#define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
arm64/mm: Refactor {pgd, pud, pmd, pte}_ERROR()
The function __{pgd, pud, pmd, pte}_error() are introduced so that
they can be called by {pgd, pud, pmd, pte}_ERROR(). However, some
of the functions could never be called when the corresponding page
table level isn't enabled. For example, __{pud, pmd}_error() are
unused when PUD and PMD are folded to PGD.
This removes __{pgd, pud, pmd, pte}_error() and calls pr_err() from
{pgd, pud, pmd, pte}_ERROR() directly, similar to what x86/powerpc
are doing. With this, the code also looks a bit simpler.
Signed-off-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20200913234730.23145-1-gshan@redhat.com
Signed-off-by: Will Deacon <will@kernel.org>
2020-09-13 23:47:30 +00:00
|
|
|
#define pte_ERROR(e) \
|
|
|
|
pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
/*
|
|
|
|
* Macros to convert between a physical address and its placement in a
|
|
|
|
* page table entry, taking care of 52-bit addresses.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_ARM64_PA_BITS_52
|
2021-11-05 07:54:03 +00:00
|
|
|
static inline phys_addr_t __pte_to_phys(pte_t pte)
|
|
|
|
{
|
2024-02-14 12:29:16 +00:00
|
|
|
pte_val(pte) &= ~PTE_MAYBE_SHARED;
|
2021-11-05 07:54:03 +00:00
|
|
|
return (pte_val(pte) & PTE_ADDR_LOW) |
|
2022-11-07 14:17:53 +00:00
|
|
|
((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
|
2021-11-05 07:54:03 +00:00
|
|
|
}
|
|
|
|
static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
|
|
|
|
{
|
2024-02-14 12:29:16 +00:00
|
|
|
return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK;
|
2021-11-05 07:54:03 +00:00
|
|
|
}
|
2017-12-13 17:07:21 +00:00
|
|
|
#else
|
2024-02-14 12:29:16 +00:00
|
|
|
#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_LOW)
|
2017-12-13 17:07:21 +00:00
|
|
|
#define __phys_to_pte_val(phys) (phys)
|
|
|
|
#endif
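To see what the two helpers above are doing, here is a self-contained round-trip
sketch using the 64 KB-page FEAT_LPA layout as an assumed example (PA bits [51:48]
stored in PTE bits [15:12], so PTE_ADDR_HIGH_SHIFT is 36); the authoritative field
definitions live in pgtable-hwdef.h:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Assumed 64 KB-page / FEAT_LPA layout, for illustration only. */
#define PTE_ADDR_LOW          0x0000ffffffff0000ULL  /* PA[47:16] in place      */
#define PTE_ADDR_HIGH         0x000000000000f000ULL  /* PA[51:48] in PTE[15:12] */
#define PTE_ADDR_HIGH_SHIFT   36
#define PHYS_TO_PTE_ADDR_MASK (PTE_ADDR_LOW | PTE_ADDR_HIGH)

static uint64_t phys_to_pte_val(uint64_t phys)
{
	/* High PA bits slide down into the spare low PTE bits. */
	return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK;
}

static uint64_t pte_to_phys(uint64_t pteval)
{
	/* ...and slide back up when the PA is reconstructed. */
	return (pteval & PTE_ADDR_LOW) |
	       ((pteval & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT);
}

int main(void)
{
	uint64_t pa = 0x000f234567890000ULL;      /* a 52-bit, 64 KB-aligned PA */

	assert(pte_to_phys(phys_to_pte_val(pa)) == pa);
	printf("pte bits: 0x%016llx\n", (unsigned long long)phys_to_pte_val(pa));
	return 0;
}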
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
#define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT)
|
|
|
|
#define pfn_pte(pfn,prot) \
|
|
|
|
__pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
|
|
|
#define pte_none(pte) (!pte_val(pte))
|
|
|
|
#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0))
|
|
|
|
#define pte_page(pte) (pfn_to_page(pte_pfn(pte)))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* The following only work if pte_present(). Undefined behaviour otherwise.
|
|
|
|
*/
|
2014-02-25 11:38:53 +00:00
|
|
|
#define pte_present(pte) (!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)))
|
|
|
|
#define pte_young(pte) (!!(pte_val(pte) & PTE_AF))
|
|
|
|
#define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL))
|
|
|
|
#define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE))
|
2023-07-13 09:20:04 +00:00
|
|
|
#define pte_rdonly(pte) (!!(pte_val(pte) & PTE_RDONLY))
|
2022-05-13 03:23:06 +00:00
|
|
|
#define pte_user(pte) (!!(pte_val(pte) & PTE_USER))
|
2017-01-27 10:54:12 +00:00
|
|
|
#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
|
2015-10-07 17:00:21 +00:00
|
|
|
#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
|
2019-07-16 23:30:51 +00:00
|
|
|
#define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP))
|
2020-05-04 13:42:36 +00:00
|
|
|
#define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \
|
|
|
|
PTE_ATTRINDX(MT_NORMAL_TAGGED))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-09 20:52:09 +00:00
|
|
|
#define pte_cont_addr_end(addr, end) \
|
|
|
|
({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \
|
|
|
|
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
|
|
|
|
})
|
|
|
|
|
|
|
|
#define pmd_cont_addr_end(addr, end) \
|
|
|
|
({ unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK; \
|
|
|
|
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
|
|
|
|
})
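Both macros clamp a range to the next contiguous-block boundary so a walker can
process one CONT-sized chunk at a time. A quick user-space check of that behaviour,
assuming the 4 KB-granule value CONT_PTE_SIZE == 64 KB from the table in the commit
message above:

#include <stdio.h>
#include <assert.h>

/* Assumed 4 KB-granule values: 16 contiguous PTEs cover 64 KB. */
#define CONT_PTE_SIZE 0x10000UL
#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1))

static unsigned long pte_cont_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + CONT_PTE_SIZE) & CONT_PTE_MASK;

	/* The "- 1" form avoids overflow when end sits at the top of the VA space. */
	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	/* Mid-block start: stop at the next 64 KB boundary. */
	assert(pte_cont_addr_end(0x12345000UL, 0x12400000UL) == 0x12350000UL);
	/* Range ends before the boundary: stop at end. */
	assert(pte_cont_addr_end(0x12345000UL, 0x12348000UL) == 0x12348000UL);
	printf("ok\n");
	return 0;
}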
|
|
|
|
|
2023-07-13 09:20:04 +00:00
|
|
|
#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte))
|
2015-07-10 16:24:28 +00:00
|
|
|
#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
|
|
|
|
#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte))
|
|
|
|
|
2015-07-28 15:14:03 +00:00
|
|
|
#define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID))
|
2021-03-12 17:38:10 +00:00
|
|
|
/*
|
|
|
|
* Execute-only user mappings do not have the PTE_USER bit set. All valid
|
|
|
|
* kernel mappings have the PTE_UXN bit set.
|
|
|
|
*/
|
2017-01-27 10:54:12 +00:00
|
|
|
#define pte_valid_not_user(pte) \
|
2021-03-12 17:38:10 +00:00
|
|
|
((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN))
|
2015-10-30 18:56:19 +00:00
|
|
|
/*
|
|
|
|
* Could the pte be present in the TLB? We must check mm_tlb_flush_pending
|
|
|
|
* so that we don't erroneously return false for pages that have been
|
|
|
|
* remapped as PROT_NONE but are yet to be flushed from the TLB.
|
2020-11-20 13:28:01 +00:00
|
|
|
* Note that we can't make any assumptions based on the state of the access
|
|
|
|
* flag, since ptep_clear_flush_young() elides a DSB when invalidating the
|
|
|
|
* TLB.
|
2015-10-30 18:56:19 +00:00
|
|
|
*/
|
|
|
|
#define pte_accessible(mm, pte) \
|
2020-11-20 13:28:01 +00:00
|
|
|
(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2017-10-26 17:36:47 +00:00
|
|
|
/*
|
2021-03-12 17:38:10 +00:00
|
|
|
* p??_access_permitted() is true for valid user mappings (PTE_USER
|
|
|
|
* bit set, subject to the write permission check). For execute-only
|
|
|
|
* mappings, like PROT_EXEC with EPAN (both PTE_USER and PTE_UXN bits
|
|
|
|
* not set) must return false. PROT_NONE mappings do not have the
|
|
|
|
* PTE_VALID bit set.
|
2017-10-26 17:36:47 +00:00
|
|
|
*/
|
|
|
|
#define pte_access_permitted(pte, write) \
|
2021-03-12 17:38:10 +00:00
|
|
|
(((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
|
2017-10-26 17:36:47 +00:00
|
|
|
#define pmd_access_permitted(pmd, write) \
|
|
|
|
(pte_access_permitted(pmd_pte(pmd), (write)))
|
|
|
|
#define pud_access_permitted(pud, write) \
|
|
|
|
(pte_access_permitted(pud_pte(pud), (write)))
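To make the combinations described in the comment concrete, here is a stand-alone
truth-table walk of the three interesting cases (ordinary user page, execute-only
page, kernel page). The bit positions are illustrative stand-ins, not taken from
pgtable-hwdef.h:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative bit positions only. */
#define PTE_VALID (1ULL << 0)
#define PTE_USER  (1ULL << 6)
#define PTE_UXN   (1ULL << 54)
#define PTE_WRITE (1ULL << 51)

static bool pte_write(uint64_t pte) { return pte & PTE_WRITE; }

static bool pte_access_permitted(uint64_t pte, bool write)
{
	return ((pte & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) &&
	       (!write || pte_write(pte));
}

int main(void)
{
	uint64_t user_rw    = PTE_VALID | PTE_USER | PTE_WRITE; /* ordinary user page */
	uint64_t exec_only  = PTE_VALID;                        /* EPAN execute-only  */
	uint64_t kernel_pte = PTE_VALID | PTE_UXN;              /* kernel mapping     */

	printf("user rw   read:%d write:%d\n",
	       pte_access_permitted(user_rw, false), pte_access_permitted(user_rw, true));
	printf("exec-only read:%d\n", pte_access_permitted(exec_only, false));  /* 0 */
	printf("kernel    read:%d\n", pte_access_permitted(kernel_pte, false)); /* 0 */
	return 0;
}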
|
|
|
|
|
2014-08-19 19:41:42 +00:00
|
|
|
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
|
2014-01-15 14:07:12 +00:00
|
|
|
{
|
2014-08-19 19:41:42 +00:00
|
|
|
pte_val(pte) &= ~pgprot_val(prot);
|
2014-01-15 14:07:12 +00:00
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
2014-08-19 19:41:42 +00:00
|
|
|
static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
|
2014-01-15 14:07:12 +00:00
|
|
|
{
|
2014-08-19 19:41:42 +00:00
|
|
|
pte_val(pte) |= pgprot_val(prot);
|
2014-01-15 14:07:12 +00:00
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
2020-09-09 04:53:02 +00:00
|
|
|
static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pmd_val(pmd) &= ~pgprot_val(prot);
|
|
|
|
return pmd;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
|
|
|
|
{
|
|
|
|
pmd_val(pmd) |= pgprot_val(prot);
|
|
|
|
return pmd;
|
|
|
|
}
|
|
|
|
|
2023-06-13 00:10:27 +00:00
|
|
|
static inline pte_t pte_mkwrite_novma(pte_t pte)
|
2014-08-19 19:41:42 +00:00
|
|
|
{
|
2017-07-04 18:04:18 +00:00
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_WRITE));
|
|
|
|
pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
|
|
|
|
return pte;
|
2014-08-19 19:41:42 +00:00
|
|
|
}
|
|
|
|
|
2014-01-15 14:07:12 +00:00
|
|
|
static inline pte_t pte_mkclean(pte_t pte)
|
|
|
|
{
|
2017-12-01 17:22:14 +00:00
|
|
|
pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
|
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
|
|
|
|
|
|
|
|
return pte;
|
2014-01-15 14:07:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t pte_mkdirty(pte_t pte)
|
|
|
|
{
|
2017-12-01 17:22:14 +00:00
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
|
|
|
|
|
|
|
|
if (pte_write(pte))
|
|
|
|
pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
|
|
|
|
|
|
|
|
return pte;
|
2014-01-15 14:07:12 +00:00
|
|
|
}
|
|
|
|
|
2020-11-20 13:57:48 +00:00
|
|
|
static inline pte_t pte_wrprotect(pte_t pte)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
|
|
|
|
* clear), set the PTE_DIRTY bit.
|
|
|
|
*/
|
|
|
|
if (pte_hw_dirty(pte))
|
2023-07-13 07:15:18 +00:00
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
|
2020-11-20 13:57:48 +00:00
|
|
|
|
|
|
|
pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
|
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
2014-01-15 14:07:12 +00:00
|
|
|
static inline pte_t pte_mkold(pte_t pte)
|
|
|
|
{
|
2014-08-19 19:41:42 +00:00
|
|
|
return clear_pte_bit(pte, __pgprot(PTE_AF));
|
2014-01-15 14:07:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t pte_mkyoung(pte_t pte)
|
|
|
|
{
|
2014-08-19 19:41:42 +00:00
|
|
|
return set_pte_bit(pte, __pgprot(PTE_AF));
|
2014-01-15 14:07:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t pte_mkspecial(pte_t pte)
|
|
|
|
{
|
2014-08-19 19:41:42 +00:00
|
|
|
return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
|
2014-01-15 14:07:12 +00:00
|
|
|
}
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2015-10-07 17:00:21 +00:00
|
|
|
static inline pte_t pte_mkcont(pte_t pte)
|
|
|
|
{
|
2015-12-17 19:31:26 +00:00
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_CONT));
|
|
|
|
return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
|
2015-10-07 17:00:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t pte_mknoncont(pte_t pte)
|
|
|
|
{
|
|
|
|
return clear_pte_bit(pte, __pgprot(PTE_CONT));
|
|
|
|
}
|
|
|
|
|
2016-08-24 17:27:30 +00:00
|
|
|
static inline pte_t pte_mkpresent(pte_t pte)
|
|
|
|
{
|
|
|
|
return set_pte_bit(pte, __pgprot(PTE_VALID));
|
|
|
|
}
|
|
|
|
|
2015-12-17 19:31:26 +00:00
|
|
|
static inline pmd_t pmd_mkcont(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
|
|
|
|
}
|
|
|
|
|
2019-07-16 23:30:51 +00:00
|
|
|
static inline pte_t pte_mkdevmap(pte_t pte)
|
|
|
|
{
|
arm64: mm: add missing PTE_SPECIAL in pte_mkdevmap on arm64
Without this patch, the MAP_SYNC test case will cause a print_bad_pte
warning on arm64 as follows:
[ 25.542693] BUG: Bad page map in process mapdax333 pte:2e8000448800f53 pmd:41ff5f003
[ 25.546360] page:ffff7e0010220000 refcount:1 mapcount:-1 mapping:ffff8003e29c7440 index:0x0
[ 25.550281] ext4_dax_aops
[ 25.550282] name:"__aaabbbcccddd__"
[ 25.551553] flags: 0x3ffff0000001002(referenced|reserved)
[ 25.555802] raw: 03ffff0000001002 ffff8003dfffa908 0000000000000000 ffff8003e29c7440
[ 25.559446] raw: 0000000000000000 0000000000000000 00000001fffffffe 0000000000000000
[ 25.563075] page dumped because: bad pte
[ 25.564938] addr:0000ffffbe05b000 vm_flags:208000fb anon_vma:0000000000000000 mapping:ffff8003e29c7440 index:0
[ 25.574272] file:__aaabbbcccddd__ fault:ext4_dax_fault mmmmap:ext4_file_mmap readpage:0x0
[ 25.578799] CPU: 1 PID: 1180 Comm: mapdax333 Not tainted 5.2.0+ #21
[ 25.581702] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[ 25.585624] Call trace:
[ 25.587008] dump_backtrace+0x0/0x178
[ 25.588799] show_stack+0x24/0x30
[ 25.590328] dump_stack+0xa8/0xcc
[ 25.591901] print_bad_pte+0x18c/0x218
[ 25.593628] unmap_page_range+0x778/0xc00
[ 25.595506] unmap_single_vma+0x94/0xe8
[ 25.597304] unmap_vmas+0x90/0x108
[ 25.598901] unmap_region+0xc0/0x128
[ 25.600566] __do_munmap+0x284/0x3f0
[ 25.602245] __vm_munmap+0x78/0xe0
[ 25.603820] __arm64_sys_munmap+0x34/0x48
[ 25.605709] el0_svc_common.constprop.0+0x78/0x168
[ 25.607956] el0_svc_handler+0x34/0x90
[ 25.609698] el0_svc+0x8/0xc
[...]
The root cause is in _vm_normal_page: without the PTE_SPECIAL bit,
the return value will be incorrectly set to pfn_to_page(pfn) instead
of NULL. Besides, this patch also rewrites pmd_mkdevmap to avoid
setting PTE_SPECIAL for the pmd.
The MAP_SYNC test case is as follows (provided by Yibo Cai):
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
/* mount -o dax /dev/pmem0 /mnt */
#define F "/mnt/__aaabbbcccddd__"
int main(void)
{
int fd;
char buf[4096];
void *addr;
if ((fd = open(F, O_CREAT|O_TRUNC|O_RDWR, 0644)) < 0) {
perror("open1");
return 1;
}
if (write(fd, buf, 4096) != 4096) {
perror("lseek");
return 1;
}
addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_SYNC, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap");
printf("did you mount with '-o dax'?\n");
return 1;
}
memset(addr, 0x55, 4096);
if (munmap(addr, 4096) == -1) {
perror("munmap");
return 1;
}
close(fd);
return 0;
}
Fixes: 73b20c84d42d ("arm64: mm: implement pte_devmap support")
Reported-by: Yibo Cai <Yibo.Cai@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Robin Murphy <Robin.Murphy@arm.com>
Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2019-08-07 04:58:51 +00:00
|
|
|
return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
|
2019-07-16 23:30:51 +00:00
|
|
|
}
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
static inline void set_pte(pte_t *ptep, pte_t pte)
|
|
|
|
{
|
2018-02-15 11:14:56 +00:00
|
|
|
WRITE_ONCE(*ptep, pte);
|
2014-06-09 10:55:03 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only if the new pte is valid and kernel, otherwise TLB maintenance
|
|
|
|
* or update_mmu_cache() have the necessary barriers.
|
|
|
|
*/
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
if (pte_valid_not_user(pte)) {
|
2014-06-09 10:55:03 +00:00
|
|
|
dsb(ishst);
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
isb();
|
|
|
|
}
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
2018-04-17 12:03:09 +00:00
|
|
|
extern void __sync_icache_dcache(pte_t pteval);
|
2023-01-30 12:14:57 +00:00
|
|
|
bool pgattr_change_is_safe(u64 old, u64 new);
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2015-07-10 16:24:28 +00:00
|
|
|
/*
|
|
|
|
* PTE bits configuration in the presence of hardware Dirty Bit Management
|
|
|
|
* (PTE_WRITE == PTE_DBM):
|
|
|
|
*
|
|
|
|
* Dirty Writable | PTE_RDONLY PTE_WRITE PTE_DIRTY (sw)
|
|
|
|
* 0 0 | 1 0 0
|
|
|
|
* 0 1 | 1 1 0
|
|
|
|
* 1 0 | 1 0 1
|
|
|
|
* 1 1 | 0 1 x
|
|
|
|
*
|
|
|
|
* When hardware DBM is not present, the software PTE_DIRTY bit is updated via
|
|
|
|
* the page fault mechanism. Checking the dirty status of a pte becomes:
|
|
|
|
*
|
2015-09-11 17:22:00 +00:00
|
|
|
* PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
|
2015-07-10 16:24:28 +00:00
|
|
|
*/
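A compact user-space model of the table above, re-implementing just pte_mkdirty() and
pte_wrprotect() from this file over plain integers so the state transitions can be
stepped through; the bit positions are assumptions for illustration only:

#include <stdio.h>
#include <stdint.h>

/* Assumed bit positions for the three bits the table talks about. */
#define PTE_RDONLY (1ULL << 7)
#define PTE_WRITE  (1ULL << 51)   /* doubles as the hardware DBM bit */
#define PTE_DIRTY  (1ULL << 55)   /* software dirty bit              */

static int pte_write(uint64_t p)    { return !!(p & PTE_WRITE); }
static int pte_rdonly(uint64_t p)   { return !!(p & PTE_RDONLY); }
static int pte_hw_dirty(uint64_t p) { return pte_write(p) && !pte_rdonly(p); }
static int pte_dirty(uint64_t p)    { return (p & PTE_DIRTY) || pte_hw_dirty(p); }

static uint64_t pte_mkdirty(uint64_t p)
{
	p |= PTE_DIRTY;
	if (pte_write(p))
		p &= ~PTE_RDONLY;          /* let hardware see it as writable and dirty */
	return p;
}

static uint64_t pte_wrprotect(uint64_t p)
{
	if (pte_hw_dirty(p))
		p |= PTE_DIRTY;            /* preserve dirtiness in the software bit     */
	p &= ~PTE_WRITE;
	p |= PTE_RDONLY;
	return p;
}

int main(void)
{
	uint64_t pte = PTE_WRITE | PTE_RDONLY; /* row "0 1": clean, writable            */

	pte = pte_mkdirty(pte);                /* row "1 1": hardware-dirty             */
	printf("dirty=%d write=%d\n", pte_dirty(pte), pte_write(pte));
	pte = pte_wrprotect(pte);              /* row "1 0": dirtiness kept in sw bit   */
	printf("dirty=%d write=%d\n", pte_dirty(pte), pte_write(pte));
	return 0;
}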
|
2019-06-10 12:41:07 +00:00
|
|
|
|
2023-01-30 12:14:57 +00:00
|
|
|
static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
|
2019-06-10 12:41:07 +00:00
|
|
|
pte_t pte)
|
2012-03-05 11:49:27 +00:00
|
|
|
{
|
2018-02-15 11:14:56 +00:00
|
|
|
pte_t old_pte;
|
|
|
|
|
2019-06-10 12:41:07 +00:00
|
|
|
if (!IS_ENABLED(CONFIG_DEBUG_VM))
|
|
|
|
return;
|
|
|
|
|
|
|
|
old_pte = READ_ONCE(*ptep);
|
|
|
|
|
|
|
|
if (!pte_valid(old_pte) || !pte_valid(pte))
|
|
|
|
return;
|
|
|
|
if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1)
|
|
|
|
return;
|
2013-01-09 11:08:10 +00:00
|
|
|
|
2015-07-10 16:24:28 +00:00
|
|
|
/*
|
2019-06-10 12:41:07 +00:00
|
|
|
* Check for potential race with hardware updates of the pte
|
|
|
|
* (ptep_set_access_flags safely changes valid ptes without going
|
|
|
|
* through an invalid entry).
|
2015-07-10 16:24:28 +00:00
|
|
|
*/
|
2019-06-10 12:41:07 +00:00
|
|
|
VM_WARN_ONCE(!pte_young(pte),
|
|
|
|
"%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
|
|
|
|
__func__, pte_val(old_pte), pte_val(pte));
|
|
|
|
VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
|
|
|
|
"%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
|
|
|
|
__func__, pte_val(old_pte), pte_val(pte));
|
2023-01-30 12:14:57 +00:00
|
|
|
VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
|
|
|
|
"%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
|
|
|
|
__func__, pte_val(old_pte), pte_val(pte));
|
2019-06-10 12:41:07 +00:00
|
|
|
}
|
|
|
|
|
2023-10-05 14:07:30 +00:00
|
|
|
static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
|
2019-06-10 12:41:07 +00:00
|
|
|
{
|
|
|
|
if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
|
|
|
|
__sync_icache_dcache(pte);
|
|
|
|
|
2021-06-21 11:17:11 +00:00
|
|
|
/*
|
|
|
|
* If the PTE would provide user space access to the tags associated
|
|
|
|
* with it then ensure that the MTE tags are synchronised. Although
|
|
|
|
* pte_access_permitted() returns false for exec only mappings, they
|
|
|
|
* don't expose tags (instruction fetches don't check tags).
|
|
|
|
*/
|
|
|
|
if (system_supports_mte() && pte_access_permitted(pte, false) &&
|
2023-05-23 00:43:10 +00:00
|
|
|
!pte_special(pte) && pte_tagged(pte))
|
2023-10-05 14:07:30 +00:00
|
|
|
mte_sync_tags(pte, nr_pages);
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
2023-10-17 10:57:55 +00:00
|
|
|
static inline void set_ptes(struct mm_struct *mm,
|
|
|
|
unsigned long __always_unused addr,
|
|
|
|
pte_t *ptep, pte_t pte, unsigned int nr)
|
2022-05-13 03:23:06 +00:00
|
|
|
{
|
2023-08-02 15:13:38 +00:00
|
|
|
page_table_check_ptes_set(mm, ptep, pte, nr);
|
2023-10-05 14:07:30 +00:00
|
|
|
__sync_cache_and_tags(pte, nr);
|
2023-08-02 15:13:38 +00:00
|
|
|
|
|
|
|
for (;;) {
|
2023-10-05 14:07:30 +00:00
|
|
|
__check_safe_pte_update(mm, ptep, pte);
|
|
|
|
set_pte(ptep, pte);
|
2023-08-02 15:13:38 +00:00
|
|
|
if (--nr == 0)
|
|
|
|
break;
|
|
|
|
ptep++;
|
|
|
|
pte_val(pte) += PAGE_SIZE;
|
|
|
|
}
|
2022-05-13 03:23:06 +00:00
|
|
|
}
|
2023-08-02 15:13:38 +00:00
|
|
|
#define set_ptes set_ptes
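The loop above works because the output address sits at its natural position inside
the descriptor, so adding PAGE_SIZE to the pte value advances the encoded pfn by
exactly one while leaving the attribute bits alone. A toy confirmation of that
property, assuming 4 KB pages and the non-52-bit address mask:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PTE_ADDR_LOW 0x0000fffffffff000ULL   /* assumed PA[47:12] field */

static uint64_t pte_pfn(uint64_t pteval)
{
	return (pteval & PTE_ADDR_LOW) >> PAGE_SHIFT;
}

int main(void)
{
	uint64_t prot = 0x0040000000000703ULL;  /* arbitrary stand-in attribute bits */
	uint64_t pte  = ((uint64_t)0x80000 << PAGE_SHIFT) | prot;   /* pfn 0x80000   */

	for (unsigned int i = 0; i < 4; i++) {
		assert(pte_pfn(pte) == 0x80000 + i);  /* consecutive pfns, same prot */
		pte += PAGE_SIZE;
	}
	printf("ok\n");
	return 0;
}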
|
2022-05-13 03:23:06 +00:00
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Huge pte definitions.
|
|
|
|
*/
|
2013-04-10 12:48:00 +00:00
|
|
|
#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hugetlb definitions.
|
|
|
|
*/
|
2015-12-17 19:31:26 +00:00
|
|
|
#define HUGE_MAX_HSTATE 4
|
2013-04-10 12:48:00 +00:00
|
|
|
#define HPAGE_SHIFT PMD_SHIFT
|
|
|
|
#define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
|
|
|
|
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
|
|
|
|
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
static inline pte_t pgd_pte(pgd_t pgd)
|
|
|
|
{
|
|
|
|
return __pte(pgd_val(pgd));
|
|
|
|
}
|
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
static inline pte_t p4d_pte(p4d_t p4d)
|
|
|
|
{
|
|
|
|
return __pte(p4d_val(p4d));
|
|
|
|
}
|
|
|
|
|
2014-10-09 22:29:25 +00:00
|
|
|
static inline pte_t pud_pte(pud_t pud)
|
|
|
|
{
|
|
|
|
return __pte(pud_val(pud));
|
|
|
|
}
|
|
|
|
|
2018-12-11 17:10:39 +00:00
|
|
|
static inline pud_t pte_pud(pte_t pte)
|
|
|
|
{
|
|
|
|
return __pud(pte_val(pte));
|
|
|
|
}
|
|
|
|
|
2014-10-09 22:29:25 +00:00
|
|
|
static inline pmd_t pud_pmd(pud_t pud)
|
|
|
|
{
|
|
|
|
return __pmd(pud_val(pud));
|
|
|
|
}
|
|
|
|
|
2014-02-25 10:02:13 +00:00
|
|
|
static inline pte_t pmd_pte(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return __pte(pmd_val(pmd));
|
|
|
|
}
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2014-02-25 10:02:13 +00:00
|
|
|
static inline pmd_t pte_pmd(pte_t pte)
|
|
|
|
{
|
|
|
|
return __pmd(pte_val(pte));
|
|
|
|
}
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2019-05-27 03:58:15 +00:00
|
|
|
static inline pgprot_t mk_pud_sect_prot(pgprot_t prot)
|
2014-10-20 13:42:07 +00:00
|
|
|
{
|
2019-05-27 03:58:15 +00:00
|
|
|
return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
|
2014-10-20 13:42:07 +00:00
|
|
|
{
|
2019-05-27 03:58:15 +00:00
|
|
|
return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT);
|
2014-10-20 13:42:07 +00:00
|
|
|
}
|
|
|
|
|
2022-05-10 01:20:46 +00:00
|
|
|
static inline pte_t pte_swp_mkexclusive(pte_t pte)
|
|
|
|
{
|
|
|
|
return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int pte_swp_exclusive(pte_t pte)
|
|
|
|
{
|
|
|
|
return pte_val(pte) & PTE_SWP_EXCLUSIVE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
|
|
|
|
{
|
|
|
|
return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
|
|
|
|
}
|
|
|
|
|
2022-06-07 12:50:27 +00:00
|
|
|
/*
|
|
|
|
* Select all bits except the pfn
|
|
|
|
*/
|
|
|
|
static inline pgprot_t pte_pgprot(pte_t pte)
|
|
|
|
{
|
|
|
|
unsigned long pfn = pte_pfn(pte);
|
|
|
|
|
|
|
|
return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
|
|
|
|
}
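The XOR is a way of stripping the pfn without naming the bits that hold it: rebuilding
a pte from the same pfn with an empty pgprot reproduces exactly the address bits, so
XOR-ing that back out leaves only the attributes. A stand-alone illustration with an
assumed field layout:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PAGE_SHIFT   12
#define PTE_ADDR_LOW 0x0000fffffffff000ULL            /* assumed pfn field */

static uint64_t pfn_pte(uint64_t pfn, uint64_t prot) { return (pfn << PAGE_SHIFT) | prot; }
static uint64_t pte_pfn(uint64_t pte)                { return (pte & PTE_ADDR_LOW) >> PAGE_SHIFT; }

/* Mirror of pte_pgprot(): everything except the pfn. */
static uint64_t pte_pgprot(uint64_t pte)
{
	return pfn_pte(pte_pfn(pte), 0) ^ pte;
}

int main(void)
{
	uint64_t prot = 0x0060000000000f53ULL;             /* arbitrary attribute bits */
	uint64_t pte  = pfn_pte(0x1234, prot);

	assert(pte_pgprot(pte) == prot);                   /* pfn removed, prot intact */
	printf("prot = 0x%016llx\n", (unsigned long long)pte_pgprot(pte));
	return 0;
}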
|
|
|
|
|
2016-04-08 22:50:28 +00:00
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
/*
|
2020-06-09 04:32:38 +00:00
|
|
|
* See the comment in include/linux/pgtable.h
|
2016-04-08 22:50:28 +00:00
|
|
|
*/
|
|
|
|
static inline int pte_protnone(pte_t pte)
|
|
|
|
{
|
|
|
|
return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int pmd_protnone(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return pte_protnone(pmd_pte(pmd));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2020-09-09 04:53:02 +00:00
|
|
|
#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID))
|
|
|
|
|
|
|
|
static inline int pmd_present(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd);
|
|
|
|
}
|
|
|
|
|
2013-04-19 15:23:57 +00:00
|
|
|
/*
|
|
|
|
* THP definitions.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
2020-09-09 04:53:02 +00:00
|
|
|
static inline int pmd_trans_huge(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
|
|
|
|
}
|
2014-10-09 22:29:25 +00:00
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2014-12-10 23:44:36 +00:00
|
|
|
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
|
2014-02-25 10:02:13 +00:00
|
|
|
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
|
2018-08-22 20:36:31 +00:00
|
|
|
#define pmd_valid(pmd) pte_valid(pmd_pte(pmd))
|
2022-05-13 03:23:06 +00:00
|
|
|
#define pmd_user(pmd) pte_user(pmd_pte(pmd))
|
|
|
|
#define pmd_user_exec(pmd) pte_user_exec(pmd_pte(pmd))
|
2020-11-13 10:46:06 +00:00
|
|
|
#define pmd_cont(pmd) pte_cont(pmd_pte(pmd))
|
2014-02-25 10:02:13 +00:00
|
|
|
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
|
|
|
|
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
|
2023-06-13 00:10:27 +00:00
|
|
|
#define pmd_mkwrite_novma(pmd) pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)))
|
2016-05-05 09:44:01 +00:00
|
|
|
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
|
2014-02-25 10:02:13 +00:00
|
|
|
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
|
|
|
|
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
|
2020-09-09 04:53:02 +00:00
|
|
|
|
|
|
|
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
|
|
|
|
{
|
|
|
|
pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID));
|
|
|
|
pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID));
|
|
|
|
|
|
|
|
return pmd;
|
|
|
|
}
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2016-03-15 10:46:34 +00:00
|
|
|
#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
|
|
|
|
|
2014-02-25 10:02:13 +00:00
|
|
|
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
|
2013-04-19 15:23:57 +00:00
|
|
|
|
|
|
|
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
|
|
|
|
|
2019-07-16 23:30:51 +00:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
#define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd))
|
|
|
|
#endif
|
2019-08-07 04:58:51 +00:00
|
|
|
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
|
|
|
|
}
|
2019-07-16 23:30:51 +00:00
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
#define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd))
|
|
|
|
#define __phys_to_pmd_val(phys) __phys_to_pte_val(phys)
|
|
|
|
#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
|
|
|
|
#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
|
2013-04-19 15:23:57 +00:00
|
|
|
#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
|
|
|
|
|
2018-12-11 17:10:40 +00:00
|
|
|
#define pud_young(pud) pte_young(pud_pte(pud))
|
2018-12-11 17:10:39 +00:00
|
|
|
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
|
2014-10-09 22:29:25 +00:00
|
|
|
#define pud_write(pud) pte_write(pud_pte(pud))
|
2017-12-13 17:07:21 +00:00
|
|
|
|
2018-12-11 17:10:41 +00:00
|
|
|
#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT))
|
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
#define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud))
|
|
|
|
#define __phys_to_pud_val(phys) __phys_to_pte_val(phys)
|
|
|
|
#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
|
|
|
|
#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2023-10-17 10:57:55 +00:00
|
|
|
static inline void __set_pte_at(struct mm_struct *mm,
|
|
|
|
unsigned long __always_unused addr,
|
2023-10-05 14:07:30 +00:00
|
|
|
pte_t *ptep, pte_t pte, unsigned int nr)
|
|
|
|
{
|
|
|
|
__sync_cache_and_tags(pte, nr);
|
|
|
|
__check_safe_pte_update(mm, ptep, pte);
|
|
|
|
set_pte(ptep, pte);
|
|
|
|
}
|
|
|
|
|
2022-05-13 03:23:06 +00:00
|
|
|
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pmd_t *pmdp, pmd_t pmd)
|
|
|
|
{
|
2023-07-13 17:26:35 +00:00
|
|
|
page_table_check_pmd_set(mm, pmdp, pmd);
|
2023-10-05 14:07:30 +00:00
|
|
|
return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd),
|
|
|
|
PMD_SIZE >> PAGE_SHIFT);
|
2022-05-13 03:23:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pud_t *pudp, pud_t pud)
|
|
|
|
{
|
2023-07-13 17:26:36 +00:00
|
|
|
page_table_check_pud_set(mm, pudp, pud);
|
2023-10-05 14:07:30 +00:00
|
|
|
return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud),
|
|
|
|
PUD_SIZE >> PAGE_SHIFT);
|
2022-05-13 03:23:06 +00:00
|
|
|
}
|
2013-04-19 15:23:57 +00:00
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d))
|
|
|
|
#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys)
|
|
|
|
|
2017-12-13 17:07:21 +00:00
|
|
|
#define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd))
|
|
|
|
#define __phys_to_pgd_val(phys) __phys_to_pte_val(phys)
|
|
|
|
|
2014-04-03 14:57:15 +00:00
|
|
|
#define __pgprot_modify(prot,mask,bits) \
|
|
|
|
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
|
|
|
|
|
2020-06-02 04:51:32 +00:00
|
|
|
#define pgprot_nx(prot) \
|
2020-06-15 15:27:43 +00:00
|
|
|
__pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN)
|
2020-06-02 04:51:32 +00:00
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Mark the prot value as uncacheable and unbufferable.
|
|
|
|
*/
|
|
|
|
#define pgprot_noncached(prot) \
|
2014-03-12 16:07:06 +00:00
|
|
|
__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN)
|
2012-03-05 11:49:27 +00:00
|
|
|
#define pgprot_writecombine(prot) \
|
2014-03-12 16:07:06 +00:00
|
|
|
__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
|
2014-09-29 14:29:31 +00:00
|
|
|
#define pgprot_device(prot) \
|
|
|
|
__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN)
|
2021-03-09 12:26:01 +00:00
|
|
|
#define pgprot_tagged(prot) \
|
|
|
|
__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED))
|
|
|
|
#define pgprot_mhp pgprot_tagged
|
2019-08-03 09:38:31 +00:00
|
|
|
/*
|
|
|
|
* DMA allocations for non-coherent devices use what the Arm architecture calls
|
|
|
|
* "Normal non-cacheable" memory, which permits speculation, unaligned accesses
|
|
|
|
* and merging of writes. This is different from "Device-nGnR[nE]" memory which
|
|
|
|
* is intended for MMIO and thus forbids speculation, preserves access size,
|
|
|
|
* requires strict alignment and can also force write responses to come from the
|
|
|
|
* endpoint.
|
|
|
|
*/
|
2019-08-26 07:03:44 +00:00
|
|
|
#define pgprot_dmacoherent(prot) \
|
|
|
|
__pgprot_modify(prot, PTE_ATTRINDX_MASK, \
|
|
|
|
PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
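Each of these helpers follows the same pattern: __pgprot_modify() clears the MAIR
attribute-index field (and the device/uncached variants also force PXN/UXN) and ORs in
the replacement. A minimal user-space sketch of that masking, with made-up field
values standing in for the real PTE_* definitions:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Assumed stand-ins: a 3-bit attribute index at bits [4:2], PXN/UXN up high. */
#define PTE_ATTRINDX_MASK  (7ULL << 2)
#define PTE_ATTRINDX(x)    ((uint64_t)(x) << 2)
#define PTE_PXN            (1ULL << 53)
#define PTE_UXN            (1ULL << 54)
#define MT_NORMAL          0
#define MT_NORMAL_NC       2

#define __pgprot_modify(prot, mask, bits) (((prot) & ~(mask)) | (bits))
#define pgprot_writecombine(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)

int main(void)
{
	uint64_t prot = PTE_ATTRINDX(MT_NORMAL) | 0x3;     /* cacheable + some low bits */
	uint64_t wc   = pgprot_writecombine(prot);

	assert((wc & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_NC));
	assert((wc & PTE_PXN) && (wc & PTE_UXN));          /* never executable          */
	assert(wc & 0x3);                                  /* unrelated bits untouched  */
	printf("0x%016llx -> 0x%016llx\n", (unsigned long long)prot, (unsigned long long)wc);
	return 0;
}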
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
#define __HAVE_PHYS_MEM_ACCESS_PROT
|
|
|
|
struct file;
|
|
|
|
extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
|
|
|
|
unsigned long size, pgprot_t vma_prot);
|
|
|
|
|
|
|
|
#define pmd_none(pmd) (!pmd_val(pmd))
|
|
|
|
|
2012-12-07 18:35:41 +00:00
|
|
|
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
|
|
|
|
PMD_TYPE_TABLE)
|
|
|
|
#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
|
|
|
|
PMD_TYPE_SECT)
|
2022-04-22 06:00:33 +00:00
|
|
|
#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd))
|
2021-05-10 11:07:51 +00:00
|
|
|
#define pmd_bad(pmd) (!pmd_table(pmd))
|
2012-12-07 18:35:41 +00:00
|
|
|
|
2020-11-13 10:46:06 +00:00
|
|
|
#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
|
|
|
|
#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
|
|
|
|
|
2016-02-25 15:53:44 +00:00
|
|
|
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
|
2019-07-31 20:05:45 +00:00
|
|
|
static inline bool pud_sect(pud_t pud) { return false; }
|
|
|
|
static inline bool pud_table(pud_t pud) { return true; }
|
2014-05-06 13:02:27 +00:00
|
|
|
#else
|
|
|
|
#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
|
|
|
|
PUD_TYPE_SECT)
|
2014-12-09 07:26:47 +00:00
|
|
|
#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
|
|
|
|
PUD_TYPE_TABLE)
|
2014-05-06 13:02:27 +00:00
|
|
|
#endif
|
2012-12-07 18:35:41 +00:00
|
|
|
|
2024-02-14 12:29:20 +00:00
|
|
|
extern pgd_t init_pg_dir[];
|
2018-09-24 16:15:02 +00:00
|
|
|
extern pgd_t init_pg_end[];
|
2024-02-14 12:29:20 +00:00
|
|
|
extern pgd_t swapper_pg_dir[];
|
|
|
|
extern pgd_t idmap_pg_dir[];
|
|
|
|
extern pgd_t tramp_pg_dir[];
|
|
|
|
extern pgd_t reserved_pg_dir[];
|
2018-09-24 16:15:02 +00:00
|
|
|
|
|
|
|
extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd);
|
|
|
|
|
|
|
|
static inline bool in_swapper_pgdir(void *addr)
|
|
|
|
{
|
|
|
|
return ((unsigned long)addr & PAGE_MASK) ==
|
|
|
|
((unsigned long)swapper_pg_dir & PAGE_MASK);
|
|
|
|
}
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
|
|
|
|
{
|
2018-10-05 13:49:16 +00:00
|
|
|
#ifdef __PAGETABLE_PMD_FOLDED
|
|
|
|
if (in_swapper_pgdir(pmdp)) {
|
2018-09-24 16:15:02 +00:00
|
|
|
set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd)));
|
|
|
|
return;
|
|
|
|
}
|
2018-10-05 13:49:16 +00:00
|
|
|
#endif /* __PAGETABLE_PMD_FOLDED */
|
2018-09-24 16:15:02 +00:00
|
|
|
|
2018-02-15 11:14:56 +00:00
|
|
|
WRITE_ONCE(*pmdp, pmd);
|
2018-08-22 20:36:31 +00:00
|
|
|
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
if (pmd_valid(pmd)) {
|
2018-08-22 20:36:31 +00:00
|
|
|
dsb(ishst);
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
isb();
|
|
|
|
}
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pmd_clear(pmd_t *pmdp)
|
|
|
|
{
|
|
|
|
set_pmd(pmdp, __pmd(0));
|
|
|
|
}
|
|
|
|
|
2016-01-25 11:45:04 +00:00
|
|
|
static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
|
2012-03-05 11:49:27 +00:00
|
|
|
{
|
2017-12-13 17:07:21 +00:00
|
|
|
return __pmd_to_phys(pmd);
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
2020-06-09 04:33:10 +00:00
|
|
|
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return (unsigned long)__va(pmd_page_paddr(pmd));
|
|
|
|
}
|
2019-04-29 17:37:01 +00:00
|
|
|
|
2016-01-25 11:45:03 +00:00
|
|
|
/* Find an entry in the third-level page table. */
|
2017-09-29 10:29:55 +00:00
|
|
|
#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t))
|
2016-01-25 11:45:03 +00:00
|
|
|
|
2016-01-25 11:45:07 +00:00
|
|
|
#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr))
|
|
|
|
#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr))
|
|
|
|
#define pte_clear_fixmap() clear_fixmap(FIX_PTE)
|
|
|
|
|
2020-04-27 23:46:55 +00:00
|
|
|
#define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2016-02-16 12:52:37 +00:00
|
|
|
/* use ONLY for statically allocated translation tables */
|
|
|
|
#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Conversion functions: convert a page and protection to a page entry,
|
|
|
|
* and a page entry and page directory to the page they refer to.
|
|
|
|
*/
|
|
|
|
#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)
|
|
|
|
|
2015-04-14 22:45:39 +00:00
|
|
|
#if CONFIG_PGTABLE_LEVELS > 2
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2020-09-13 23:47:30 +00:00
|
|
|
#define pmd_ERROR(e) \
|
|
|
|
pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
#define pud_none(pud) (!pud_val(pud))
|
2021-05-10 11:07:51 +00:00
|
|
|
#define pud_bad(pud) (!pud_table(pud))
|
2017-06-08 17:25:26 +00:00
|
|
|
#define pud_present(pud) pte_present(pud_pte(pud))
|
2022-04-22 06:00:33 +00:00
|
|
|
#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
|
2018-08-22 20:36:31 +00:00
|
|
|
#define pud_valid(pud) pte_valid(pud_pte(pud))
|
2022-05-13 03:23:06 +00:00
|
|
|
#define pud_user(pud) pte_user(pud_pte(pud))
|
2022-11-22 12:31:37 +00:00
|
|
|
#define pud_user_exec(pud) pte_user_exec(pud_pte(pud))
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2024-02-16 23:59:44 +00:00
|
|
|
static inline bool pgtable_l4_enabled(void);
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
static inline void set_pud(pud_t *pudp, pud_t pud)
|
|
|
|
{
|
2024-02-16 23:59:44 +00:00
|
|
|
if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) {
|
2018-09-24 16:15:02 +00:00
|
|
|
set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud)));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2018-02-15 11:14:56 +00:00
|
|
|
WRITE_ONCE(*pudp, pud);
|
2018-08-22 20:36:31 +00:00
|
|
|
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
if (pud_valid(pud)) {
|
2018-08-22 20:36:31 +00:00
|
|
|
dsb(ishst);
|
Revert "arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}"
This reverts commit 24fe1b0efad4fcdd32ce46cffeab297f22581707.
Commit 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from
set_{pte,pmd,pud}") removed ISB instructions immediately following updates
to the page table, on the grounds that they are not required by the
architecture and a DSB alone is sufficient to ensure that subsequent data
accesses use the new translation:
DDI0487E_a, B2-128:
| ... no instruction that appears in program order after the DSB
| instruction can alter any state of the system or perform any part of
| its functionality until the DSB completes other than:
|
| * Being fetched from memory and decoded
| * Reading the general-purpose, SIMD and floating-point,
| Special-purpose, or System registers that are directly or indirectly
| read without causing side-effects.
However, the same document also states the following:
DDI0487E_a, B2-125:
| DMB and DSB instructions affect reads and writes to the memory system
| generated by Load/Store instructions and data or unified cache
| maintenance instructions being executed by the PE. Instruction fetches
| or accesses caused by a hardware translation table access are not
| explicit accesses.
which appears to claim that the DSB alone is insufficient. Unfortunately,
some CPU designers have followed the second clause above, whereas in Linux
we've been relying on the first. This means that our mapping sequence:
MOV X0, <valid pte>
STR X0, [Xptep] // Store new PTE to page table
DSB ISHST
LDR X1, [X2] // Translates using the new PTE
can actually raise a translation fault on the load instruction because the
translation can be performed speculatively before the page table update and
then marked as "faulting" by the CPU. For user PTEs, this is ok because we
can handle the spurious fault, but for kernel PTEs and intermediate table
entries this results in a panic().
Revert the offending commit to reintroduce the missing barriers.
Cc: <stable@vger.kernel.org>
Fixes: 24fe1b0efad4fcdd ("arm64: Remove unnecessary ISBs from set_{pte,pmd,pud}")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2019-08-22 13:58:37 +00:00
|
|
|
isb();
|
|
|
|
}
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pud_clear(pud_t *pudp)
|
|
|
|
{
|
|
|
|
set_pud(pudp, __pud(0));
|
|
|
|
}
|
|
|
|
|
2016-01-25 11:45:04 +00:00
|
|
|
static inline phys_addr_t pud_page_paddr(pud_t pud)
|
2012-03-05 11:49:27 +00:00
|
|
|
{
|
2017-12-13 17:07:21 +00:00
|
|
|
return __pud_to_phys(pud);
|
2012-03-05 11:49:27 +00:00
|
|
|
}
|
|
|
|
|
2021-07-08 01:09:53 +00:00
|
|
|
static inline pmd_t *pud_pgtable(pud_t pud)
|
2020-06-09 04:33:10 +00:00
|
|
|
{
|
2021-07-08 01:09:53 +00:00
|
|
|
return (pmd_t *)__va(pud_page_paddr(pud));
|
2020-06-09 04:33:10 +00:00
|
|
|
}
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2020-06-09 04:33:10 +00:00
|
|
|
/* Find an entry in the second-level page table. */
|
2018-02-15 11:14:56 +00:00
|
|
|
#define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2016-01-25 11:45:07 +00:00
|
|
|
#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
|
|
|
|
#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr))
|
|
|
|
#define pmd_clear_fixmap() clear_fixmap(FIX_PMD)
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2020-04-27 23:46:55 +00:00
|
|
|
#define pud_page(pud) phys_to_page(__pud_to_phys(pud))
|
2014-10-09 22:29:25 +00:00
|
|
|
|
2016-02-16 12:52:37 +00:00
|
|
|
/* use ONLY for statically allocated translation tables */
|
|
|
|
#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
|
|
|
|
|
2016-01-25 11:45:04 +00:00
|
|
|
#else
|
|
|
|
|
|
|
|
#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; })
|
2023-01-09 15:47:25 +00:00
|
|
|
#define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */
|
2016-01-25 11:45:04 +00:00
|
|
|
|
2016-01-25 11:45:07 +00:00
|
|
|
/* Match pmd_offset folding in <asm-generic/pgtable-nopmd.h> */
|
|
|
|
#define pmd_set_fixmap(addr) NULL
|
|
|
|
#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp)
|
|
|
|
#define pmd_clear_fixmap()
|
|
|
|
|
2016-02-16 12:52:37 +00:00
|
|
|
#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir)
|
|
|
|
|
2015-04-14 22:45:39 +00:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
|
2012-03-05 11:49:27 +00:00
|
|
|
|
2015-04-14 22:45:39 +00:00
|
|
|
#if CONFIG_PGTABLE_LEVELS > 3
|
2014-05-12 09:40:51 +00:00
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
static __always_inline bool pgtable_l4_enabled(void)
|
|
|
|
{
|
|
|
|
if (CONFIG_PGTABLE_LEVELS > 4 || !IS_ENABLED(CONFIG_ARM64_LPA2))
|
|
|
|
return true;
|
|
|
|
if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
|
|
|
|
return vabits_actual == VA_BITS;
|
|
|
|
return alternative_has_cap_unlikely(ARM64_HAS_VA52);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mm_pud_folded(const struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return !pgtable_l4_enabled();
|
|
|
|
}
|
|
|
|
#define mm_pud_folded mm_pud_folded
|
|
|
|
|
2020-09-13 23:47:30 +00:00
|
|
|
#define pud_ERROR(e) \
|
|
|
|
pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
#define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d))
|
|
|
|
#define p4d_bad(p4d) (pgtable_l4_enabled() && !(p4d_val(p4d) & 2))
|
|
|
|
#define p4d_present(p4d) (!p4d_none(p4d))
|
2014-05-12 09:40:51 +00:00
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
|
2014-05-12 09:40:51 +00:00
|
|
|
{
|
2020-06-04 23:46:23 +00:00
|
|
|
if (in_swapper_pgdir(p4dp)) {
|
|
|
|
set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d)));
|
2018-09-24 16:15:02 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
WRITE_ONCE(*p4dp, p4d);
|
2014-05-12 09:40:51 +00:00
|
|
|
dsb(ishst);
|
2019-08-23 12:03:55 +00:00
|
|
|
isb();
|
2014-05-12 09:40:51 +00:00
|
|
|
}
|
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
static inline void p4d_clear(p4d_t *p4dp)
|
2014-05-12 09:40:51 +00:00
|
|
|
{
|
2024-02-14 12:29:22 +00:00
|
|
|
if (pgtable_l4_enabled())
|
|
|
|
set_p4d(p4dp, __p4d(0));
|
2014-05-12 09:40:51 +00:00
|
|
|
}
|
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
|
2014-05-12 09:40:51 +00:00
|
|
|
{
|
2020-06-04 23:46:23 +00:00
|
|
|
return __p4d_to_phys(p4d);
|
2014-05-12 09:40:51 +00:00
|
|
|
}
|
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
|
|
|
|
|
|
|
|
static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr)
|
|
|
|
{
|
|
|
|
return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr);
|
|
|
|
}
|
|
|
|
|
2021-07-08 01:09:56 +00:00
|
|
|
static inline pud_t *p4d_pgtable(p4d_t p4d)
|
2020-06-09 04:33:10 +00:00
|
|
|
{
|
2021-07-08 01:09:56 +00:00
|
|
|
return (pud_t *)__va(p4d_page_paddr(p4d));
|
2020-06-09 04:33:10 +00:00
|
|
|
}
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
static inline phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr)
|
|
|
|
{
|
|
|
|
BUG_ON(!pgtable_l4_enabled());
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
return p4d_page_paddr(READ_ONCE(*p4dp)) + pud_index(addr) * sizeof(pud_t);
|
|
|
|
}
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
static inline
|
|
|
|
pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l4_enabled())
|
|
|
|
return p4d_to_folded_pud(p4dp, addr);
|
|
|
|
return (pud_t *)__va(p4d_page_paddr(p4d)) + pud_index(addr);
|
|
|
|
}
|
|
|
|
#define pud_offset_lockless pud_offset_lockless
|
|
|
|
|
|
|
|
static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long addr)
|
|
|
|
{
|
|
|
|
return pud_offset_lockless(p4dp, READ_ONCE(*p4dp), addr);
|
|
|
|
}
|
|
|
|
#define pud_offset pud_offset
|
|
|
|
|
|
|
|
static inline pud_t *pud_set_fixmap(unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l4_enabled())
|
|
|
|
return NULL;
|
|
|
|
return (pud_t *)set_fixmap_offset(FIX_PUD, addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pud_t *pud_set_fixmap_offset(p4d_t *p4dp, unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l4_enabled())
|
|
|
|
return p4d_to_folded_pud(p4dp, addr);
|
|
|
|
return pud_set_fixmap(pud_offset_phys(p4dp, addr));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pud_clear_fixmap(void)
|
|
|
|
{
|
|
|
|
if (pgtable_l4_enabled())
|
|
|
|
clear_fixmap(FIX_PUD);
|
|
|
|
}
|
2014-12-20 00:49:40 +00:00
|
|
|
|
2016-02-16 12:52:37 +00:00
|
|
|
/* use ONLY for statically allocated translation tables */
|
2024-02-14 12:29:22 +00:00
|
|
|
static inline pud_t *pud_offset_kimg(p4d_t *p4dp, u64 addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l4_enabled())
|
|
|
|
return p4d_to_folded_pud(p4dp, addr);
|
|
|
|
return (pud_t *)__phys_to_kimg(pud_offset_phys(p4dp, addr));
|
|
|
|
}
|
|
|
|
|
|
|
|
#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d)))
|
2016-02-16 12:52:37 +00:00
|
|
|
|
2016-01-25 11:45:04 +00:00
|
|
|
#else
|
|
|
|
|
2024-02-14 12:29:22 +00:00
|
|
|
static inline bool pgtable_l4_enabled(void) { return false; }
|
|
|
|
|
2020-06-04 23:46:23 +00:00
|
|
|
#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;})
|
2016-01-25 11:45:04 +00:00
|
|
|
|
2016-01-25 11:45:07 +00:00
|
|
|
/* Match pud_offset folding in <asm-generic/pgtable-nopud.h> */
|
|
|
|
#define pud_set_fixmap(addr) NULL
|
|
|
|
#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp)
|
|
|
|
#define pud_clear_fixmap()
|
|
|
|
|
2016-02-16 12:52:37 +00:00
|
|
|
#define pud_offset_kimg(dir,addr) ((pud_t *)dir)
|
|
|
|
|
2015-04-14 22:45:39 +00:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
|
2014-05-12 09:40:51 +00:00
|
|
|
|
2024-02-14 12:29:17 +00:00
|
|
|
#if CONFIG_PGTABLE_LEVELS > 4
|
|
|
|
|
|
|
|
static __always_inline bool pgtable_l5_enabled(void)
|
|
|
|
{
|
|
|
|
if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
|
|
|
|
return vabits_actual == VA_BITS;
|
|
|
|
return alternative_has_cap_unlikely(ARM64_HAS_VA52);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool mm_p4d_folded(const struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return !pgtable_l5_enabled();
|
|
|
|
}
|
|
|
|
#define mm_p4d_folded mm_p4d_folded
|
|
|
|
|
|
|
|
#define p4d_ERROR(e) \
|
|
|
|
pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e))
|
|
|
|
|
|
|
|
#define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd))
|
|
|
|
#define pgd_bad(pgd) (pgtable_l5_enabled() && !(pgd_val(pgd) & 2))
|
|
|
|
#define pgd_present(pgd) (!pgd_none(pgd))
|
|
|
|
|
|
|
|
static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
|
|
|
|
{
|
|
|
|
if (in_swapper_pgdir(pgdp)) {
|
|
|
|
set_swapper_pgd(pgdp, __pgd(pgd_val(pgd)));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
WRITE_ONCE(*pgdp, pgd);
|
|
|
|
dsb(ishst);
|
|
|
|
isb();
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void pgd_clear(pgd_t *pgdp)
|
|
|
|
{
|
|
|
|
if (pgtable_l5_enabled())
|
|
|
|
set_pgd(pgdp, __pgd(0));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
|
|
|
|
{
|
|
|
|
return __pgd_to_phys(pgd);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))
|
|
|
|
|
|
|
|
static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr)
|
|
|
|
{
|
|
|
|
return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr)
|
|
|
|
{
|
|
|
|
BUG_ON(!pgtable_l5_enabled());
|
|
|
|
|
|
|
|
return pgd_page_paddr(READ_ONCE(*pgdp)) + p4d_index(addr) * sizeof(p4d_t);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline
|
|
|
|
p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l5_enabled())
|
|
|
|
return pgd_to_folded_p4d(pgdp, addr);
|
|
|
|
return (p4d_t *)__va(pgd_page_paddr(pgd)) + p4d_index(addr);
|
|
|
|
}
|
|
|
|
#define p4d_offset_lockless p4d_offset_lockless
|
|
|
|
|
|
|
|
static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr)
|
|
|
|
{
|
|
|
|
return p4d_offset_lockless(pgdp, READ_ONCE(*pgdp), addr);
|
|
|
|
}
|
|
|
|
|
2024-02-14 12:29:20 +00:00
|
|
|
static inline p4d_t *p4d_set_fixmap(unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l5_enabled())
|
|
|
|
return NULL;
|
|
|
|
return (p4d_t *)set_fixmap_offset(FIX_P4D, addr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline p4d_t *p4d_set_fixmap_offset(pgd_t *pgdp, unsigned long addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l5_enabled())
|
|
|
|
return pgd_to_folded_p4d(pgdp, addr);
|
|
|
|
return p4d_set_fixmap(p4d_offset_phys(pgdp, addr));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void p4d_clear_fixmap(void)
|
|
|
|
{
|
|
|
|
if (pgtable_l5_enabled())
|
|
|
|
clear_fixmap(FIX_P4D);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* use ONLY for statically allocated translation tables */
|
|
|
|
static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr)
|
|
|
|
{
|
|
|
|
if (!pgtable_l5_enabled())
|
|
|
|
return pgd_to_folded_p4d(pgdp, addr);
|
|
|
|
return (p4d_t *)__phys_to_kimg(p4d_offset_phys(pgdp, addr));
|
|
|
|
}
|
|
|
|
|
2024-02-14 12:29:17 +00:00
|
|
|
#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static inline bool pgtable_l5_enabled(void) { return false; }
|
|
|
|
|
2024-02-14 12:29:20 +00:00
|
|
|
/* Match p4d_offset folding in <asm-generic/pgtable-nop4d.h> */
|
|
|
|
#define p4d_set_fixmap(addr) NULL
|
|
|
|
#define p4d_set_fixmap_offset(p4dp, addr) ((p4d_t *)p4dp)
|
|
|
|
#define p4d_clear_fixmap()
|
|
|
|
|
|
|
|
#define p4d_offset_kimg(dir,addr) ((p4d_t *)dir)
|
|
|
|
|
2024-02-14 12:29:17 +00:00
|
|
|
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
|
|
|
|
|
2020-09-13 23:47:30 +00:00
|
|
|
#define pgd_ERROR(e) \
|
|
|
|
pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))
|
2014-07-21 13:52:49 +00:00
|
|
|
|
2016-01-25 11:45:07 +00:00
|
|
|
#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr))
|
|
|
|
#define pgd_clear_fixmap() clear_fixmap(FIX_PGD)
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
|
|
|
|
{
|
2019-11-27 10:00:27 +00:00
|
|
|
/*
|
|
|
|
* Normal and Normal-Tagged are two different memory types and indices
|
|
|
|
* in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK.
|
|
|
|
*/
|
2012-12-18 14:15:15 +00:00
|
|
|
const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
|
2019-11-27 10:00:27 +00:00
|
|
|
PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP |
|
|
|
|
PTE_ATTRINDX_MASK;
|
2015-07-10 16:24:28 +00:00
|
|
|
/* preserve the hardware dirty information */
|
|
|
|
if (pte_hw_dirty(pte))
|
2023-07-13 07:15:18 +00:00
|
|
|
pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
|
arm64: mm: Always make sw-dirty PTEs hw-dirty in pte_modify
It is currently possible for a userspace application to enter an
infinite page fault loop when using HugeTLB pages implemented with
contiguous PTEs when HAFDBS is not available. This happens because:
1. The kernel may sometimes write PTEs that are sw-dirty but hw-clean
(PTE_DIRTY | PTE_RDONLY | PTE_WRITE).
2. If, during a write, the CPU uses a sw-dirty, hw-clean PTE in handling
the memory access on a system without HAFDBS, we will get a page
fault.
3. HugeTLB will check if it needs to update the dirty bits on the PTE.
For contiguous PTEs, it will check to see if the pgprot bits need
updating. In this case, HugeTLB wants to write a sequence of
sw-dirty, hw-dirty PTEs, but it finds that all the PTEs it is about
to overwrite are all pte_dirty() (pte_sw_dirty() => pte_dirty()),
so it thinks no update is necessary.
We can get the kernel to write a sw-dirty, hw-clean PTE with the
following steps (showing the relevant VMA flags and pgprot bits):
i. Create a valid, writable contiguous PTE.
VMA vmflags: VM_SHARED | VM_READ | VM_WRITE
VMA pgprot bits: PTE_RDONLY | PTE_WRITE
PTE pgprot bits: PTE_DIRTY | PTE_WRITE
ii. mprotect the VMA to PROT_NONE.
VMA vmflags: VM_SHARED
VMA pgprot bits: PTE_RDONLY
PTE pgprot bits: PTE_DIRTY | PTE_RDONLY
iii. mprotect the VMA back to PROT_READ | PROT_WRITE.
VMA vmflags: VM_SHARED | VM_READ | VM_WRITE
VMA pgprot bits: PTE_RDONLY | PTE_WRITE
PTE pgprot bits: PTE_DIRTY | PTE_WRITE | PTE_RDONLY
Make it impossible to create a writeable sw-dirty, hw-clean PTE with
pte_modify(). Such a PTE should be impossible to create, and there may
be places that assume that pte_dirty() implies pte_hw_dirty().
Signed-off-by: James Houghton <jthoughton@google.com>
Fixes: 031e6e6b4e12 ("arm64: hugetlb: Avoid unnecessary clearing in huge_ptep_set_access_flags")
Cc: <stable@vger.kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Link: https://lore.kernel.org/r/20231204172646.2541916-3-jthoughton@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2023-12-04 17:26:46 +00:00
|
|
|
/*
|
|
|
|
* If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware
|
|
|
|
* dirtiness again.
|
|
|
|
*/
|
|
|
|
if (pte_sw_dirty(pte))
|
|
|
|
pte = pte_mkdirty(pte);
|
2012-03-05 11:49:27 +00:00
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
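As a side note, the PROT_NONE round trip described in steps i-iii of the
pte_modify() commit message above can be exercised from userspace with a short
sketch like the one below. It is illustrative only: it assumes a HugeTLB pool
is configured, uses a 2M mapping for simplicity (the size that actually uses
contiguous PTEs depends on the base page size), and merely performs the access
pattern rather than detecting the historical fault loop.

/* Hypothetical userspace sketch of steps i-iii; not kernel code. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define DEMO_LEN	(2UL * 1024 * 1024)	/* one 2M hugepage, if available */

int main(void)
{
	char *p = mmap(NULL, DEMO_LEN, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0xaa, DEMO_LEN);			/* i:   dirty the writable mapping */
	mprotect(p, DEMO_LEN, PROT_NONE);		/* ii:  drop all permissions */
	mprotect(p, DEMO_LEN, PROT_READ | PROT_WRITE);	/* iii: restore them */
	p[0] = 0x55;	/* write that used to fault repeatedly on pre-fix kernels without HAFDBS */

	munmap(p, DEMO_LEN);
	return 0;
}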
2014-02-25 10:02:13 +00:00
|
|
|
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
|
|
|
|
{
|
|
|
|
return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
|
|
|
|
}
|
|
|
|
|
2016-04-13 15:01:22 +00:00
|
|
|
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
|
|
|
|
extern int ptep_set_access_flags(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *ptep,
|
|
|
|
pte_t entry, int dirty);
|
|
|
|
|
2016-05-05 09:44:00 +00:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
|
|
|
|
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pmd_t *pmdp,
|
|
|
|
pmd_t entry, int dirty)
|
|
|
|
{
|
|
|
|
return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
|
|
|
|
}
|
2019-07-16 23:30:51 +00:00
|
|
|
|
|
|
|
static inline int pud_devmap(pud_t pud)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int pgd_devmap(pgd_t pgd)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2016-05-05 09:44:00 +00:00
|
|
|
#endif
|
|
|
|
|
2022-05-17 07:45:48 +00:00
|
|
|
#ifdef CONFIG_PAGE_TABLE_CHECK
|
|
|
|
static inline bool pte_user_accessible_page(pte_t pte)
|
|
|
|
{
|
|
|
|
return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool pmd_user_accessible_page(pmd_t pmd)
|
|
|
|
{
|
2022-11-21 07:36:08 +00:00
|
|
|
return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
|
2022-05-17 07:45:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool pud_user_accessible_page(pud_t pud)
|
|
|
|
{
|
2022-11-22 12:31:37 +00:00
|
|
|
return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud));
|
2022-05-17 07:45:48 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-07-10 16:24:28 +00:00
|
|
|
/*
|
|
|
|
* Atomic pte/pmd modifications.
|
|
|
|
*/
|
|
|
|
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
|
2016-04-13 16:57:37 +00:00
|
|
|
static inline int __ptep_test_and_clear_young(pte_t *ptep)
|
2015-07-10 16:24:28 +00:00
|
|
|
{
|
2017-06-26 13:27:36 +00:00
|
|
|
pte_t old_pte, pte;
|
2015-07-10 16:24:28 +00:00
|
|
|
|
2017-06-26 13:27:36 +00:00
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
do {
|
|
|
|
old_pte = pte;
|
|
|
|
pte = pte_mkold(pte);
|
|
|
|
pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
|
|
|
|
pte_val(old_pte), pte_val(pte));
|
|
|
|
} while (pte_val(pte) != pte_val(old_pte));
|
2015-07-10 16:24:28 +00:00
|
|
|
|
2017-06-26 13:27:36 +00:00
|
|
|
return pte_young(pte);
|
2015-07-10 16:24:28 +00:00
|
|
|
}
|
|
|
|
|
2016-04-13 16:57:37 +00:00
|
|
|
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
|
|
|
|
unsigned long address,
|
|
|
|
pte_t *ptep)
|
|
|
|
{
|
|
|
|
return __ptep_test_and_clear_young(ptep);
|
|
|
|
}
|
|
|
|
|
2018-10-29 09:25:58 +00:00
|
|
|
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
|
|
|
|
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pte_t *ptep)
|
|
|
|
{
|
|
|
|
int young = ptep_test_and_clear_young(vma, address, ptep);
|
|
|
|
|
|
|
|
if (young) {
|
|
|
|
/*
|
|
|
|
* We can elide the trailing DSB here since the worst that can
|
|
|
|
* happen is that a CPU continues to use the young entry in its
|
|
|
|
* TLB and we mistakenly reclaim the associated page. The
|
|
|
|
* window for such an event is bounded by the next
|
|
|
|
* context-switch, which provides a DSB to complete the TLB
|
|
|
|
* invalidation.
|
|
|
|
*/
|
|
|
|
flush_tlb_page_nosync(vma, address);
|
|
|
|
}
|
|
|
|
|
|
|
|
return young;
|
|
|
|
}
|
|
|
|
|
2015-07-10 16:24:28 +00:00
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
|
|
|
|
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
|
|
|
|
unsigned long address,
|
|
|
|
pmd_t *pmdp)
|
|
|
|
{
|
|
|
|
return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
|
|
|
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
|
|
|
|
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
|
|
|
|
unsigned long address, pte_t *ptep)
|
|
|
|
{
|
2022-05-13 03:23:06 +00:00
|
|
|
pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
|
|
|
|
|
2023-07-13 17:26:31 +00:00
|
|
|
page_table_check_pte_clear(mm, pte);
|
2022-05-13 03:23:06 +00:00
|
|
|
|
|
|
|
return pte;
|
2015-07-10 16:24:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
2016-05-05 09:43:59 +00:00
|
|
|
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
|
|
|
|
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
|
|
|
|
unsigned long address, pmd_t *pmdp)
|
2015-07-10 16:24:28 +00:00
|
|
|
{
|
2022-05-13 03:23:06 +00:00
|
|
|
pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0));
|
|
|
|
|
2023-07-13 17:26:32 +00:00
|
|
|
page_table_check_pmd_clear(mm, pmd);
|
2022-05-13 03:23:06 +00:00
|
|
|
|
|
|
|
return pmd;
|
2015-07-10 16:24:28 +00:00
|
|
|
}
|
|
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
|
|
|
|
/*
|
2017-12-01 17:22:14 +00:00
|
|
|
 * ptep_set_wrprotect - mark read-only while transferring potential hardware
|
|
|
|
* dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
|
2015-07-10 16:24:28 +00:00
|
|
|
*/
|
|
|
|
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
|
|
|
|
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
|
|
|
|
{
|
2017-06-26 13:27:36 +00:00
|
|
|
pte_t old_pte, pte;
|
|
|
|
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
do {
|
|
|
|
old_pte = pte;
|
|
|
|
pte = pte_wrprotect(pte);
|
|
|
|
pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
|
|
|
|
pte_val(old_pte), pte_val(pte));
|
|
|
|
} while (pte_val(pte) != pte_val(old_pte));
|
2015-07-10 16:24:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
|
|
|
|
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
|
|
|
|
unsigned long address, pmd_t *pmdp)
|
|
|
|
{
|
|
|
|
ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
|
|
|
|
}
|
2018-02-01 00:17:55 +00:00
|
|
|
|
|
|
|
#define pmdp_establish pmdp_establish
|
|
|
|
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
|
|
|
unsigned long address, pmd_t *pmdp, pmd_t pmd)
|
|
|
|
{
|
2023-07-13 17:26:35 +00:00
|
|
|
page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
|
2018-02-01 00:17:55 +00:00
|
|
|
return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
|
|
|
|
}
|
2015-07-10 16:24:28 +00:00
|
|
|
#endif
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Encode and decode a swap entry:
|
2013-11-27 16:59:27 +00:00
|
|
|
* bits 0-1: present (must be zero)
|
2022-05-10 01:20:46 +00:00
|
|
|
 * bit 2: remember PG_anon_exclusive
|
|
|
|
* bits 3-7: swap type
|
2015-02-10 22:10:15 +00:00
|
|
|
* bits 8-57: swap offset
|
2016-03-09 16:31:29 +00:00
|
|
|
* bit 58: PTE_PROT_NONE (must be zero)
|
2012-03-05 11:49:27 +00:00
|
|
|
*/
|
2022-05-10 01:20:46 +00:00
|
|
|
#define __SWP_TYPE_SHIFT 3
|
|
|
|
#define __SWP_TYPE_BITS 5
|
2015-02-10 22:10:15 +00:00
|
|
|
#define __SWP_OFFSET_BITS 50
|
2012-03-05 11:49:27 +00:00
|
|
|
#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1)
|
|
|
|
#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
|
2013-11-27 16:59:27 +00:00
|
|
|
#define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1)
|
2012-03-05 11:49:27 +00:00
|
|
|
|
|
|
|
#define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
|
2013-11-27 16:59:27 +00:00
|
|
|
#define __swp_offset(x) (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK)
|
2012-03-05 11:49:27 +00:00
|
|
|
#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) })
|
|
|
|
|
|
|
|
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
|
|
|
|
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })
|
|
|
|
|
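To make the bit layout in the comment above concrete, here is a small
standalone round-trip check. The demo_* names and the local entry type are
illustrative stand-ins, not the kernel's swp_entry_t or macros.

/* Standalone sketch of the swap-entry layout: bits 3-7 type, bits 8-57 offset. */
#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } demo_swp_entry_t;

#define DEMO_SWP_TYPE_SHIFT	3
#define DEMO_SWP_TYPE_BITS	5
#define DEMO_SWP_OFFSET_SHIFT	(DEMO_SWP_TYPE_BITS + DEMO_SWP_TYPE_SHIFT)
#define DEMO_SWP_TYPE_MASK	((1UL << DEMO_SWP_TYPE_BITS) - 1)
#define DEMO_SWP_OFFSET_MASK	((1UL << 50) - 1)

#define demo_swp_entry(type, off) \
	((demo_swp_entry_t){ ((unsigned long)(type) << DEMO_SWP_TYPE_SHIFT) | \
			     ((unsigned long)(off) << DEMO_SWP_OFFSET_SHIFT) })
#define demo_swp_type(x)	(((x).val >> DEMO_SWP_TYPE_SHIFT) & DEMO_SWP_TYPE_MASK)
#define demo_swp_offset(x)	(((x).val >> DEMO_SWP_OFFSET_SHIFT) & DEMO_SWP_OFFSET_MASK)

int main(void)
{
	demo_swp_entry_t e = demo_swp_entry(3, 0x12345);

	/* Bits 0-1 stay clear, so the entry is never a valid/present PTE. */
	assert((e.val & 0x3) == 0);
	assert(demo_swp_type(e) == 3);
	assert(demo_swp_offset(e) == 0x12345);
	printf("entry=%#lx type=%lu offset=%#lx\n",
	       e.val, demo_swp_type(e), demo_swp_offset(e));
	return 0;
}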
2020-09-09 04:53:03 +00:00
|
|
|
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
|
|
|
|
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
|
|
|
|
#define __swp_entry_to_pmd(swp) __pmd((swp).val)
|
|
|
|
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
|
|
|
|
|
2012-03-05 11:49:27 +00:00
|
|
|
/*
|
|
|
|
* Ensure that there are not more swap files than can be encoded in the kernel
|
2014-03-11 10:23:39 +00:00
|
|
|
* PTEs.
|
2012-03-05 11:49:27 +00:00
|
|
|
*/
|
|
|
|
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
|
|
|
|
|
2020-05-13 15:37:50 +00:00
|
|
|
#ifdef CONFIG_ARM64_MTE
|
|
|
|
|
|
|
|
#define __HAVE_ARCH_PREPARE_TO_SWAP
|
|
|
|
static inline int arch_prepare_to_swap(struct page *page)
|
|
|
|
{
|
|
|
|
if (system_supports_mte())
|
|
|
|
return mte_save_tags(page);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define __HAVE_ARCH_SWAP_INVALIDATE
|
|
|
|
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
|
|
|
|
{
|
|
|
|
if (system_supports_mte())
|
|
|
|
mte_invalidate_tags(type, offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void arch_swap_invalidate_area(int type)
|
|
|
|
{
|
|
|
|
if (system_supports_mte())
|
|
|
|
mte_invalidate_tags_area(type);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define __HAVE_ARCH_SWAP_RESTORE
|
2022-05-13 03:23:05 +00:00
|
|
|
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
|
2020-05-13 15:37:50 +00:00
|
|
|
{
|
2022-11-04 01:10:38 +00:00
|
|
|
if (system_supports_mte())
|
|
|
|
mte_restore_tags(entry, &folio->page);
|
2020-05-13 15:37:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_ARM64_MTE */
|
|
|
|
|
2015-07-16 18:26:02 +00:00
|
|
|
/*
|
|
|
|
* On AArch64, the cache coherency is handled via the set_pte_at() function.
|
|
|
|
*/
|
2023-08-02 15:13:38 +00:00
|
|
|
static inline void update_mmu_cache_range(struct vm_fault *vmf,
|
|
|
|
struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
|
|
|
|
unsigned int nr)
|
2015-07-16 18:26:02 +00:00
|
|
|
{
|
|
|
|
/*
|
2015-10-06 17:46:30 +00:00
|
|
|
* We don't do anything here, so there's a very small chance of
|
|
|
|
* us retaking a user fault which we just fixed up. The alternative
|
|
|
|
* is doing a dsb(ishst), but that penalises the fastpath.
|
2015-07-16 18:26:02 +00:00
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
2023-08-02 15:13:38 +00:00
|
|
|
#define update_mmu_cache(vma, addr, ptep) \
|
|
|
|
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
|
2015-07-16 18:26:02 +00:00
|
|
|
#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
|
|
|
|
|
2017-12-13 17:07:18 +00:00
|
|
|
#ifdef CONFIG_ARM64_PA_BITS_52
|
|
|
|
#define phys_to_ttbr(addr) (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52)
|
|
|
|
#else
|
|
|
|
#define phys_to_ttbr(addr) (addr)
|
|
|
|
#endif
|
|
|
|
|
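For the 52-bit PA case above, the "| ((addr) >> 46)" term folds PA bits
[51:48] into bits [5:2] of the TTBR value, which is where the architecture
expects the upper address bits when FEAT_LPA is in use. Below is a standalone
sketch; the demo mask approximates TTBR_BADDR_MASK_52 as bits [47:2], which is
an assumption made for illustration only.

/* Standalone illustration of the 52-bit PA folding; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_TTBR_BADDR_MASK_52	(((1ULL << 46) - 1) << 2)	/* bits [47:2], assumed */
#define demo_phys_to_ttbr(addr)	(((addr) | ((addr) >> 46)) & DEMO_TTBR_BADDR_MASK_52)

int main(void)
{
	uint64_t pa = 0x000F000080000000ULL;	/* PA[51:48] = 0xF, PA[31] set */
	uint64_t ttbr = demo_phys_to_ttbr(pa);

	/* PA[51:48] is folded into TTBR bits [5:2]; the lower bits stay put. */
	printf("pa   = %#018llx\n", (unsigned long long)pa);
	printf("ttbr = %#018llx (bits [5:2] = %#llx)\n",
	       (unsigned long long)ttbr, (unsigned long long)((ttbr >> 2) & 0xf));
	return 0;
}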
2019-10-11 14:09:37 +00:00
|
|
|
/*
|
|
|
|
* On arm64 without hardware Access Flag, copying from user will fail because
|
|
|
|
 * the pte is old and cannot be marked young. So we always end up with a zeroed
|
|
|
|
* page after fork() + CoW for pfn mappings. We don't always have a
|
|
|
|
* hardware-managed access flag on arm64.
|
|
|
|
*/
|
mm: x86, arm64: add arch_has_hw_pte_young()
Patch series "Multi-Gen LRU Framework", v14.
What's new
==========
1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
machines. The old direct reclaim backoff, which tries to enforce a
minimum fairness among all eligible memcgs, over-swapped by about
(total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
pulls the plug on swapping once the target is met, trades some
fairness for curtailed latency:
https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/
3. Fixed minor build warnings and conflicts. More comments and nits.
TLDR
====
The current page reclaim is too expensive in terms of CPU usage and it
often makes poor choices about what to evict. This patchset offers an
alternative solution that is performant, versatile and
straightforward.
Patchset overview
=================
The design and implementation overview is in patch 14:
https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/
01. mm: x86, arm64: add arch_has_hw_pte_young()
02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Take advantage of hardware features when trying to clear the accessed
bit in many PTEs.
03. mm/vmscan.c: refactor shrink_node()
04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
its sole caller"
Minor refactors to improve readability for the following patches.
05. mm: multi-gen LRU: groundwork
Adds the basic data structure and the functions that insert pages to
and remove pages from the multi-gen LRU (MGLRU) lists.
06. mm: multi-gen LRU: minimal implementation
A minimal implementation without optimizations.
07. mm: multi-gen LRU: exploit locality in rmap
Exploits spatial locality to improve efficiency when using the rmap.
08. mm: multi-gen LRU: support page table walks
Further exploits spatial locality by optionally scanning page tables.
09. mm: multi-gen LRU: optimize multiple memcgs
Optimizes the overall performance for multiple memcgs running mixed
types of workloads.
10. mm: multi-gen LRU: kill switch
Adds a kill switch to enable or disable MGLRU at runtime.
11. mm: multi-gen LRU: thrashing prevention
12. mm: multi-gen LRU: debugfs interface
Provide userspace with features like thrashing prevention, working set
estimation and proactive reclaim.
13. mm: multi-gen LRU: admin guide
14. mm: multi-gen LRU: design doc
Add an admin guide and a design doc.
Benchmark results
=================
Independent lab results
-----------------------
Based on the popularity of searches [01] and the memory usage in
Google's public cloud, the most popular open-source memory-hungry
applications, in alphabetical order, are:
Apache Cassandra Memcached
Apache Hadoop MongoDB
Apache Spark PostgreSQL
MariaDB (MySQL) Redis
An independent lab evaluated MGLRU with the most widely used benchmark
suites for the above applications. They posted 960 data points along
with kernel metrics and perf profiles collected over more than 500
hours of total benchmark time. Their final reports show that, with 95%
confidence intervals (CIs), the above applications all performed
significantly better for at least part of their benchmark matrices.
On 5.14:
1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
less wall time to sort three billion random integers, respectively,
under the medium- and the high-concurrency conditions, when
overcommitting memory. There were no statistically significant
changes in wall time for the rest of the benchmark matrix.
2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
more transactions per minute (TPM), respectively, under the medium-
and the high-concurrency conditions, when overcommitting memory.
There were no statistically significant changes in TPM for the rest
of the benchmark matrix.
3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
and [21.59, 30.02]% more operations per second (OPS), respectively,
for sequential access, random access and Gaussian (distribution)
access, when THP=always; 95% CIs [13.85, 15.97]% and
[23.94, 29.92]% more OPS, respectively, for random access and
Gaussian access, when THP=never. There were no statistically
significant changes in OPS for the rest of the benchmark matrix.
4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
[2.16, 3.55]% more operations per second (OPS), respectively, for
exponential (distribution) access, random access and Zipfian
(distribution) access, when underutilizing memory; 95% CIs
[8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
respectively, for exponential access, random access and Zipfian
access, when overcommitting memory.
On 5.15:
5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
and [4.11, 7.50]% more operations per second (OPS), respectively,
for exponential (distribution) access, random access and Zipfian
(distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
[6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
exponential access, random access and Zipfian access, when swap was
on.
6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
less average wall time to finish twelve parallel TeraSort jobs,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in average wall time for the rest of the
benchmark matrix.
7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
minute (TPM) under the high-concurrency condition, when swap was
off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in TPM for the rest of the benchmark matrix.
8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
[11.47, 19.36]% more total operations per second (OPS),
respectively, for sequential access, random access and Gaussian
(distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
[10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
for sequential access, random access and Gaussian access, when
THP=never.
Our lab results
---------------
To supplement the above results, we ran the following benchmark suites
on 5.16-rc7 and found no regressions [10].
fs_fio_bench_hdd_mq pft
fs_lmbench pgsql-hammerdb
fs_parallelio redis
fs_postmark stream
hackbench sysbenchthread
kernbench tpcc_spark
memcached unixbench
multichase vm-scalability
mutilate will-it-scale
nginx
[01] https://trends.google.com
[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/
[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/
[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/
[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/
[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/
[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/
[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/
[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/
[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/
Real-world applications
=======================
Third-party testimonials
------------------------
Konstantin reported [11]:
I have Archlinux with 8G RAM + zswap + swap. While developing, I
have lots of apps opened such as multiple LSP-servers for different
langs, chats, two browsers, etc... Usually, my system gets quickly
to a point of SWAP-storms, where I have to kill LSP-servers,
restart browsers to free memory, etc, otherwise the system lags
heavily and is barely usable.
1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
patchset, and I started up by opening lots of apps to create memory
pressure, and worked for a day like this. Till now I had not a
single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
getting to the point of 3G in SWAP before without a single
SWAP-storm.
Vaibhav from IBM reported [12]:
In a synthetic MongoDB Benchmark, seeing an average of ~19%
throughput improvement on POWER10(Radix MMU + 64K Page Size) with
MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
three different request distributions, namely, Exponential, Uniform
and Zipfian.
Shuang from U of Rochester reported [13]:
With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
and [9.26, 10.36]% higher throughput, respectively, for random
access, Zipfian (distribution) access and Gaussian (distribution)
access, when the average number of jobs per CPU is 1; 95% CIs
[42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
throughput, respectively, for random access, Zipfian access and
Gaussian access, when the average number of jobs per CPU is 2.
Daniel from Michigan Tech reported [14]:
With Memcached allocating ~100GB of byte-addressable Optane,
performance improvement in terms of throughput (measured as queries
per second) was about 10% for a series of workloads.
Large-scale deployments
-----------------------
We've rolled out MGLRU to tens of millions of ChromeOS users and
about a million Android users. Google's fleetwide profiling [15] shows
an overall 40% decrease in kswapd CPU usage, in addition to
improvements in other UX metrics, e.g., an 85% decrease in the number
of low-memory kills at the 75th percentile and an 18% decrease in
app launch time at the 50th percentile.
The downstream kernels that have been using MGLRU include:
1. Android [16]
2. Arch Linux Zen [17]
3. Armbian [18]
4. ChromeOS [19]
5. Liquorix [20]
6. OpenWrt [21]
7. post-factum [22]
8. XanMod [23]
[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/
[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/
[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/
[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
[15] https://dl.acm.org/doi/10.1145/2749469.2750392
[16] https://android.com
[17] https://archlinux.org
[18] https://armbian.com
[19] https://chromium.org
[20] https://liquorix.net
[21] https://openwrt.org
[22] https://codeberg.org/pf-kernel
[23] https://xanmod.org
Summary
=======
The facts are:
1. The independent lab results and the real-world applications
indicate substantial improvements; there are no known regressions.
2. Thrashing prevention, working set estimation and proactive reclaim
work out of the box; there are no equivalent solutions.
3. There is a lot of new code; no smaller changes have demonstrated
similar effects.
Our options, accordingly, are:
1. Given the amount of evidence, the reported improvements will likely
materialize for a wide range of workloads.
2. Gauging the interest from the past discussions, the new features
will likely be put to use for both personal computers and data
centers.
3. Based on Google's track record, the new code will likely be well
maintained in the long term. It'd be more difficult if not
impossible to achieve similar effects with other approaches.
This patch (of 14):
Some architectures automatically set the accessed bit in PTEs, e.g., x86
and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault following
the TLB miss of this PTE (to emulate the accessed bit).
Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty page
faults when trying to clear the accessed bit in many PTEs.
Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it should
not be used in architecture-independent code that involves correctness,
e.g., to determine whether TLB flushes are required (in combination with
the accessed bit).
Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-18 07:59:58 +00:00
|
|
|
#define arch_has_hw_pte_young cpu_has_hw_af
|
2020-11-24 18:49:26 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Experimentally, it's cheap to set the access flag in hardware and we
|
|
|
|
* benefit from prefaulting mappings as 'old' to start with.
|
|
|
|
*/
|
mm: x86, arm64: add arch_has_hw_pte_young()
Patch series "Multi-Gen LRU Framework", v14.
What's new
==========
1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
machines. The old direct reclaim backoff, which tries to enforce a
minimum fairness among all eligible memcgs, over-swapped by about
(total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
pulls the plug on swapping once the target is met, trades some
fairness for curtailed latency:
https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/
3. Fixed minior build warnings and conflicts. More comments and nits.
TLDR
====
The current page reclaim is too expensive in terms of CPU usage and it
often makes poor choices about what to evict. This patchset offers an
alternative solution that is performant, versatile and
straightforward.
Patchset overview
=================
The design and implementation overview is in patch 14:
https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/
01. mm: x86, arm64: add arch_has_hw_pte_young()
02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Take advantage of hardware features when trying to clear the accessed
bit in many PTEs.
03. mm/vmscan.c: refactor shrink_node()
04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
its sole caller"
Minor refactors to improve readability for the following patches.
05. mm: multi-gen LRU: groundwork
Adds the basic data structure and the functions that insert pages to
and remove pages from the multi-gen LRU (MGLRU) lists.
06. mm: multi-gen LRU: minimal implementation
A minimal implementation without optimizations.
07. mm: multi-gen LRU: exploit locality in rmap
Exploits spatial locality to improve efficiency when using the rmap.
08. mm: multi-gen LRU: support page table walks
Further exploits spatial locality by optionally scanning page tables.
09. mm: multi-gen LRU: optimize multiple memcgs
Optimizes the overall performance for multiple memcgs running mixed
types of workloads.
10. mm: multi-gen LRU: kill switch
Adds a kill switch to enable or disable MGLRU at runtime.
11. mm: multi-gen LRU: thrashing prevention
12. mm: multi-gen LRU: debugfs interface
Provide userspace with features like thrashing prevention, working set
estimation and proactive reclaim.
13. mm: multi-gen LRU: admin guide
14. mm: multi-gen LRU: design doc
Add an admin guide and a design doc.
Benchmark results
=================
Independent lab results
-----------------------
Based on the popularity of searches [01] and the memory usage in
Google's public cloud, the most popular open-source memory-hungry
applications, in alphabetical order, are:
Apache Cassandra Memcached
Apache Hadoop MongoDB
Apache Spark PostgreSQL
MariaDB (MySQL) Redis
An independent lab evaluated MGLRU with the most widely used benchmark
suites for the above applications. They posted 960 data points along
with kernel metrics and perf profiles collected over more than 500
hours of total benchmark time. Their final reports show that, with 95%
confidence intervals (CIs), the above applications all performed
significantly better for at least part of their benchmark matrices.
On 5.14:
1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
less wall time to sort three billion random integers, respectively,
under the medium- and the high-concurrency conditions, when
overcommitting memory. There were no statistically significant
changes in wall time for the rest of the benchmark matrix.
2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
more transactions per minute (TPM), respectively, under the medium-
and the high-concurrency conditions, when overcommitting memory.
There were no statistically significant changes in TPM for the rest
of the benchmark matrix.
3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
and [21.59, 30.02]% more operations per second (OPS), respectively,
for sequential access, random access and Gaussian (distribution)
access, when THP=always; 95% CIs [13.85, 15.97]% and
[23.94, 29.92]% more OPS, respectively, for random access and
Gaussian access, when THP=never. There were no statistically
significant changes in OPS for the rest of the benchmark matrix.
4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
[2.16, 3.55]% more operations per second (OPS), respectively, for
exponential (distribution) access, random access and Zipfian
(distribution) access, when underutilizing memory; 95% CIs
[8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
respectively, for exponential access, random access and Zipfian
access, when overcommitting memory.
On 5.15:
5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
and [4.11, 7.50]% more operations per second (OPS), respectively,
for exponential (distribution) access, random access and Zipfian
(distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
[6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
exponential access, random access and Zipfian access, when swap was
on.
6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
less average wall time to finish twelve parallel TeraSort jobs,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in average wall time for the rest of the
benchmark matrix.
7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
minute (TPM) under the high-concurrency condition, when swap was
off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in TPM for the rest of the benchmark matrix.
8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
[11.47, 19.36]% more total operations per second (OPS),
respectively, for sequential access, random access and Gaussian
(distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
[10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
for sequential access, random access and Gaussian access, when
THP=never.
Our lab results
---------------
To supplement the above results, we ran the following benchmark suites
on 5.16-rc7 and found no regressions [10].
fs_fio_bench_hdd_mq pft
fs_lmbench pgsql-hammerdb
fs_parallelio redis
fs_postmark stream
hackbench sysbenchthread
kernbench tpcc_spark
memcached unixbench
multichase vm-scalability
mutilate will-it-scale
nginx
[01] https://trends.google.com
[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/
[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/
[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/
[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/
[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/
[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/
[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/
[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/
[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/
Read-world applications
=======================
Third-party testimonials
------------------------
Konstantin reported [11]:
I have Archlinux with 8G RAM + zswap + swap. While developing, I
have lots of apps opened such as multiple LSP-servers for different
langs, chats, two browsers, etc... Usually, my system gets quickly
to a point of SWAP-storms, where I have to kill LSP-servers,
restart browsers to free memory, etc, otherwise the system lags
heavily and is barely usable.
1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
patchset, and I started up by opening lots of apps to create memory
pressure, and worked for a day like this. Till now I had not a
single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
getting to the point of 3G in SWAP before without a single
SWAP-storm.
Vaibhav from IBM reported [12]:
In a synthetic MongoDB Benchmark, seeing an average of ~19%
throughput improvement on POWER10(Radix MMU + 64K Page Size) with
MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
three different request distributions, namely, Exponential, Uniform
and Zipfan.
Shuang from U of Rochester reported [13]:
With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
and [9.26, 10.36]% higher throughput, respectively, for random
access, Zipfian (distribution) access and Gaussian (distribution)
access, when the average number of jobs per CPU is 1; 95% CIs
[42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
throughput, respectively, for random access, Zipfian access and
Gaussian access, when the average number of jobs per CPU is 2.
Daniel from Michigan Tech reported [14]:
With Memcached allocating ~100GB of byte-addressable Optante,
performance improvement in terms of throughput (measured as queries
per second) was about 10% for a series of workloads.
Large-scale deployments
-----------------------
We've rolled out MGLRU to tens of millions of ChromeOS users and
about a million Android users. Google's fleetwide profiling [15] shows
an overall 40% decrease in kswapd CPU usage, in addition to
improvements in other UX metrics, e.g., an 85% decrease in the number
of low-memory kills at the 75th percentile and an 18% decrease in
app launch time at the 50th percentile.
The downstream kernels that have been using MGLRU include:
1. Android [16]
2. Arch Linux Zen [17]
3. Armbian [18]
4. ChromeOS [19]
5. Liquorix [20]
6. OpenWrt [21]
7. post-factum [22]
8. XanMod [23]
[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/
[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/
[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/
[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
[15] https://dl.acm.org/doi/10.1145/2749469.2750392
[16] https://android.com
[17] https://archlinux.org
[18] https://armbian.com
[19] https://chromium.org
[20] https://liquorix.net
[21] https://openwrt.org
[22] https://codeberg.org/pf-kernel
[23] https://xanmod.org
Summary
=======
The facts are:
1. The independent lab results and the real-world applications
indicate substantial improvements; there are no known regressions.
2. Thrashing prevention, working set estimation and proactive reclaim
work out of the box; there are no equivalent solutions.
3. There is a lot of new code; no smaller changes have been shown to
produce similar effects.
Our conclusions, accordingly, are:
1. Given the amount of evidence, the reported improvements will likely
materialize for a wide range of workloads.
2. Gauging the interest from the past discussions, the new features
will likely be put to use for both personal computers and data
centers.
3. Based on Google's track record, the new code will likely be well
maintained in the long term. It would be difficult, if not
impossible, to achieve similar effects with other approaches.
This patch (of 14):
Some architectures automatically set the accessed bit in PTEs, e.g., x86
and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault following
the TLB miss of this PTE (to emulate the accessed bit).
Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty page
faults when trying to clear the accessed bit in many PTEs.
Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it should
not be used in architecture-independent code that involves correctness,
e.g., to determine whether TLB flushes are required (in combination with
the accessed bit).
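A rough sketch of the capability check described above is shown below.
It is illustrative only: the generic fallback and the arm64 wiring are
paraphrased from this description and normally live in separate
headers, and the exact form in the tree may differ.
    /*
     * Generic fallback: assume the accessed bit is emulated, i.e.,
     * accessing through an old PTE triggers a page fault.
     * Architectures that set the bit in hardware override this.
     */
    #ifndef arch_has_hw_pte_young
    static inline bool arch_has_hw_pte_young(void)
    {
    	return false;
    }
    #endif
    /* arm64 can wire this to its existing FEAT_HAF capability check: */
    static inline bool arch_has_hw_pte_young(void)
    {
    	return cpu_has_hw_af();
    }
    #define arch_has_hw_pte_young	arch_has_hw_pte_young
Architecture-independent callers can then batch accessed-bit clearing
when it is cheap and spread the work out otherwise, without using the
helper for correctness decisions such as whether to flush the TLB.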
Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-18 07:59:58 +00:00
|
|
|
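/*
 * With hardware-managed Access Flag (cpu_has_hw_af()), PTEs created by
 * fault-around can safely start out "old": the first real access sets
 * the flag in hardware instead of taking an extra fault.
 */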
#define arch_wants_old_prefaulted_pte cpu_has_hw_af
|
2019-10-11 14:09:37 +00:00
|
|
|
|
2021-09-20 09:29:31 +00:00
|
|
|
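/* PUD-level (1GiB) section mappings are only used with the 4K granule. */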
static inline bool pud_sect_supported(void)
|
|
|
|
{
|
|
|
|
return PAGE_SIZE == SZ_4K;
|
|
|
|
}
|
|
|
|
|
2021-03-12 17:38:10 +00:00
|
|
|
|
2023-01-02 06:16:51 +00:00
|
|
|
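/*
 * Protection changes are done as a transaction: ptep_modify_prot_start()
 * reads and clears the live entry, the caller computes the new
 * protections, and ptep_modify_prot_commit() installs the result.
 * Providing arch-specific versions lets arm64 override the default
 * behaviour, e.g. to force a full break-before-make (clear plus TLB
 * invalidation) where an erratum requires it.
 */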
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
|
|
|
|
#define ptep_modify_prot_start ptep_modify_prot_start
|
|
|
|
extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pte_t *ptep);
|
|
|
|
|
|
|
|
#define ptep_modify_prot_commit ptep_modify_prot_commit
|
|
|
|
extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr, pte_t *ptep,
|
|
|
|
pte_t old_pte, pte_t new_pte);
|
2012-03-05 11:49:27 +00:00
|
|
|
#endif /* !__ASSEMBLY__ */
|
|
|
|
|
|
|
|
#endif /* __ASM_PGTABLE_H */
|