linux-stable/arch/riscv/include/asm/pgtable.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 Regents of the University of California
 */

#ifndef _ASM_RISCV_PGTABLE_H
#define _ASM_RISCV_PGTABLE_H

#include <linux/mmzone.h>
#include <linux/sizes.h>

#include <asm/pgtable-bits.h>

#ifndef CONFIG_MMU
#define KERNEL_LINK_ADDR	PAGE_OFFSET
#define KERN_VIRT_SIZE		(UL(-1))
#else

#define ADDRESS_SPACE_END	(UL(-1))

#ifdef CONFIG_64BIT
/* Leave 2GB for kernel and BPF at the end of the address space */
#define KERNEL_LINK_ADDR	(ADDRESS_SPACE_END - SZ_2G + 1)
#else
#define KERNEL_LINK_ADDR	PAGE_OFFSET
#endif

/* Number of entries in the page global directory */
#define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
/* Number of entries in the page table */
#define PTRS_PER_PTE    (PAGE_SIZE / sizeof(pte_t))

/*
 * Half of the kernel address space (1/4 of the entries of the page global
 * directory) is for the direct mapping.
 */
#define KERN_VIRT_SIZE          ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2)

#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END      PAGE_OFFSET
#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)

#define BPF_JIT_REGION_SIZE	(SZ_128M)
#ifdef CONFIG_64BIT
#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END	(MODULES_END)
#else
#define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
#define BPF_JIT_REGION_END	(VMALLOC_END)
#endif

/* Modules always live before the kernel */
#ifdef CONFIG_64BIT
/* This is used to define the end of the KASAN shadow region */
#define MODULES_LOWEST_VADDR	(KERNEL_LINK_ADDR - SZ_2G)
#define MODULES_VADDR		(PFN_ALIGN((unsigned long)&_end) - SZ_2G)
#define MODULES_END		(PFN_ALIGN((unsigned long)&_start))
#endif

/*
 * Roughly size the vmemmap space to be large enough to fit enough
 * struct pages to map half the virtual address space. Then
 * position vmemmap directly below the VMALLOC region.
 */
#define VA_BITS_SV32 32
#ifdef CONFIG_64BIT
#define VA_BITS_SV39 39
#define VA_BITS_SV48 48
#define VA_BITS_SV57 57

#define VA_BITS		(pgtable_l5_enabled ? \
				VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39))
#else
#define VA_BITS		VA_BITS_SV32
#endif

#define VMEMMAP_SHIFT \
	(VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE	BIT(VMEMMAP_SHIFT)
#define VMEMMAP_END	VMALLOC_START
#define VMEMMAP_START	(VMALLOC_START - VMEMMAP_SIZE)

/*
 * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel
 * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled.
 */
#define vmemmap		((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT))

#define PCI_IO_SIZE      SZ_16M
#define PCI_IO_END       VMEMMAP_START
#define PCI_IO_START     (PCI_IO_END - PCI_IO_SIZE)

#define FIXADDR_TOP      PCI_IO_START
#ifdef CONFIG_64BIT
#define MAX_FDT_SIZE	 PMD_SIZE
#define FIX_FDT_SIZE	 (MAX_FDT_SIZE + SZ_2M)
#define FIXADDR_SIZE     (PMD_SIZE + FIX_FDT_SIZE)
#else
#define MAX_FDT_SIZE	 PGDIR_SIZE
#define FIX_FDT_SIZE	 MAX_FDT_SIZE
#define FIXADDR_SIZE     (PGDIR_SIZE + FIX_FDT_SIZE)
#endif
#define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)

#endif

#ifdef CONFIG_XIP_KERNEL
#define XIP_OFFSET		SZ_32M
#define XIP_OFFSET_MASK		(SZ_32M - 1)
#else
#define XIP_OFFSET		0
#endif

#ifndef __ASSEMBLY__

#include <asm/page.h>
#include <asm/tlbflush.h>
#include <linux/mm_types.h>
#include <asm/compat.h>

#define __page_val_to_pfn(_val)  (((_val) & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT)

#ifdef CONFIG_64BIT
#include <asm/pgtable-64.h>

#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1))
#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))

#define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
#define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39)
#define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64)
#define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64)
#else
#include <asm/pgtable-32.h>
#endif /* CONFIG_64BIT */

#include <linux/page_table_check.h>

#ifdef CONFIG_XIP_KERNEL
#define XIP_FIXUP(addr) ({							\
	uintptr_t __a = (uintptr_t)(addr);					\
	(__a >= CONFIG_XIP_PHYS_ADDR && \
	 __a < CONFIG_XIP_PHYS_ADDR + XIP_OFFSET * 2) ?	\
		__a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\
		__a;								\
	})
#else
#define XIP_FIXUP(addr)		(addr)
#endif /* CONFIG_XIP_KERNEL */

struct pt_alloc_ops {
	pte_t *(*get_pte_virt)(phys_addr_t pa);
	phys_addr_t (*alloc_pte)(uintptr_t va);
#ifndef __PAGETABLE_PMD_FOLDED
	pmd_t *(*get_pmd_virt)(phys_addr_t pa);
	phys_addr_t (*alloc_pmd)(uintptr_t va);
	pud_t *(*get_pud_virt)(phys_addr_t pa);
	phys_addr_t (*alloc_pud)(uintptr_t va);
	p4d_t *(*get_p4d_virt)(phys_addr_t pa);
	phys_addr_t (*alloc_p4d)(uintptr_t va);
#endif
};

extern struct pt_alloc_ops pt_ops __initdata;

#ifdef CONFIG_MMU
/* Number of PGD entries that a user-mode program can use */
#define USER_PTRS_PER_PGD   (TASK_SIZE / PGDIR_SIZE)

/* Page protection bits */
#define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER)

#define PAGE_NONE		__pgprot(_PAGE_PROT_NONE | _PAGE_READ)
#define PAGE_READ		__pgprot(_PAGE_BASE | _PAGE_READ)
#define PAGE_WRITE		__pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE)
#define PAGE_EXEC		__pgprot(_PAGE_BASE | _PAGE_EXEC)
#define PAGE_READ_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
#define PAGE_WRITE_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ |	\
					 _PAGE_EXEC | _PAGE_WRITE)

#define PAGE_COPY		PAGE_READ
#define PAGE_COPY_EXEC		PAGE_READ_EXEC
#define PAGE_SHARED		PAGE_WRITE
#define PAGE_SHARED_EXEC	PAGE_WRITE_EXEC

#define _PAGE_KERNEL		(_PAGE_READ \
				| _PAGE_WRITE \
				| _PAGE_PRESENT \
				| _PAGE_ACCESSED \
				| _PAGE_DIRTY \
				| _PAGE_GLOBAL)

#define PAGE_KERNEL		__pgprot(_PAGE_KERNEL)
#define PAGE_KERNEL_READ	__pgprot(_PAGE_KERNEL & ~_PAGE_WRITE)
#define PAGE_KERNEL_EXEC	__pgprot(_PAGE_KERNEL | _PAGE_EXEC)
#define PAGE_KERNEL_READ_EXEC	__pgprot((_PAGE_KERNEL & ~_PAGE_WRITE) \
					 | _PAGE_EXEC)

#define PAGE_TABLE		__pgprot(_PAGE_TABLE)

#define _PAGE_IOREMAP	((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_IO)
#define PAGE_KERNEL_IO		__pgprot(_PAGE_IOREMAP)

extern pgd_t swapper_pg_dir[];
extern pgd_t trampoline_pg_dir[];
extern pgd_t early_pg_dir[];

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_present(pmd_t pmd)
{
	/*
	 * Checking for _PAGE_LEAF is needed too because:
	 * When splitting a THP, split_huge_page() will temporarily clear
	 * the present bit, in this situation, pmd_present() and
	 * pmd_trans_huge() still needs to return true.
	 */
	return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE | _PAGE_LEAF));
}
#else
static inline int pmd_present(pmd_t pmd)
{
	return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE));
}
#endif

static inline int pmd_none(pmd_t pmd)
{
	return (pmd_val(pmd) == 0);
}

static inline int pmd_bad(pmd_t pmd)
{
	return !pmd_present(pmd) || (pmd_val(pmd) & _PAGE_LEAF);
}

#define pmd_leaf	pmd_leaf
static inline bool pmd_leaf(pmd_t pmd)
{
	return pmd_present(pmd) && (pmd_val(pmd) & _PAGE_LEAF);
}

static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	WRITE_ONCE(*pmdp, pmd);
}

static inline void pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}

static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot)
{
	unsigned long prot_val = pgprot_val(prot);

	ALT_THEAD_PMA(prot_val);

	return __pgd((pfn << _PAGE_PFN_SHIFT) | prot_val);
}

static inline unsigned long _pgd_pfn(pgd_t pgd)
{
	return __page_val_to_pfn(pgd_val(pgd));
}

static inline struct page *pmd_page(pmd_t pmd)
{
	return pfn_to_page(__page_val_to_pfn(pmd_val(pmd)));
}

static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pmd_val(pmd)));
}

static inline pte_t pmd_pte(pmd_t pmd)
{
	return __pte(pmd_val(pmd));
}

static inline pte_t pud_pte(pud_t pud)
{
	return __pte(pud_val(pud));
}

#ifdef CONFIG_RISCV_ISA_SVNAPOT
#include <asm/cpufeature.h>

static __always_inline bool has_svnapot(void)
{
	return riscv_has_extension_likely(RISCV_ISA_EXT_SVNAPOT);
}

static inline unsigned long pte_napot(pte_t pte)
{
	return pte_val(pte) & _PAGE_NAPOT;
}

static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
{
	int pos = order - 1 + _PAGE_PFN_SHIFT;
	unsigned long napot_bit = BIT(pos);
	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);

	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
}

#else

static __always_inline bool has_svnapot(void) { return false; }

static inline unsigned long pte_napot(pte_t pte)
{
	return 0;
}

#endif /* CONFIG_RISCV_ISA_SVNAPOT */

/* Yields the page frame number (PFN) of a page table entry */
static inline unsigned long pte_pfn(pte_t pte)
{
	unsigned long res  = __page_val_to_pfn(pte_val(pte));

	if (has_svnapot() && pte_napot(pte))
		res = res & (res - 1UL);

	return res;
}

#define pte_page(x)     pfn_to_page(pte_pfn(x))

/* Constructs a page table entry */
static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
{
	unsigned long prot_val = pgprot_val(prot);

	ALT_THEAD_PMA(prot_val);

	return __pte((pfn << _PAGE_PFN_SHIFT) | prot_val);
}

#define mk_pte(page, prot)       pfn_pte(page_to_pfn(page), prot)

static inline int pte_present(pte_t pte)
{
	return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE));
}

static inline int pte_none(pte_t pte)
{
	return (pte_val(pte) == 0);
}

static inline int pte_write(pte_t pte)
{
	return pte_val(pte) & _PAGE_WRITE;
}

static inline int pte_exec(pte_t pte)
{
	return pte_val(pte) & _PAGE_EXEC;
}

static inline int pte_user(pte_t pte)
{
	return pte_val(pte) & _PAGE_USER;
}

static inline int pte_huge(pte_t pte)
{
	return pte_present(pte) && (pte_val(pte) & _PAGE_LEAF);
}

static inline int pte_dirty(pte_t pte)
{
	return pte_val(pte) & _PAGE_DIRTY;
}

static inline int pte_young(pte_t pte)
{
	return pte_val(pte) & _PAGE_ACCESSED;
}

static inline int pte_special(pte_t pte)
{
	return pte_val(pte) & _PAGE_SPECIAL;
}

/* static inline pte_t pte_rdprotect(pte_t pte) */

static inline pte_t pte_wrprotect(pte_t pte)
{
	return __pte(pte_val(pte) & ~(_PAGE_WRITE));
}

/* static inline pte_t pte_mkread(pte_t pte) */

static inline pte_t pte_mkwrite_novma(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_WRITE);
}

/* static inline pte_t pte_mkexec(pte_t pte) */

static inline pte_t pte_mkdirty(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_DIRTY);
}

static inline pte_t pte_mkclean(pte_t pte)
{
	return __pte(pte_val(pte) & ~(_PAGE_DIRTY));
}

static inline pte_t pte_mkyoung(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_ACCESSED);
}

static inline pte_t pte_mkold(pte_t pte)
{
	return __pte(pte_val(pte) & ~(_PAGE_ACCESSED));
}

static inline pte_t pte_mkspecial(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_SPECIAL);
}

static inline pte_t pte_mkhuge(pte_t pte)
{
	return pte;
}

#ifdef CONFIG_RISCV_ISA_SVNAPOT
#define pte_leaf_size(pte)	(pte_napot(pte) ?				\
					napot_cont_size(napot_cont_order(pte)) :\
					PAGE_SIZE)
#endif

#ifdef CONFIG_NUMA_BALANCING
/*
 * See the comment in include/asm-generic/pgtable.h
 */
static inline int pte_protnone(pte_t pte)
{
	return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) == _PAGE_PROT_NONE;
}

static inline int pmd_protnone(pmd_t pmd)
{
	return pte_protnone(pmd_pte(pmd));
}
#endif

/* Modify page protection bits */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
	unsigned long newprot_val = pgprot_val(newprot);

	ALT_THEAD_PMA(newprot_val);

	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | newprot_val);
}

#define pgd_ERROR(e) \
	pr_err("%s:%d: bad pgd " PTE_FMT ".\n", __FILE__, __LINE__, pgd_val(e))


/* Commit new configuration to MMU hardware */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *ptep, unsigned int nr)
{
	/*
	 * The kernel assumes that TLBs don't cache invalid entries, but
	 * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
	 * cache flush; it is necessary even after writing invalid entries.
	 * Relying on flush_tlb_fix_spurious_fault would suffice, but
	 * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
	 */
	while (nr--)
		local_flush_tlb_page(address + nr * PAGE_SIZE);
}
#define update_mmu_cache(vma, addr, ptep) \
	update_mmu_cache_range(NULL, vma, addr, ptep, 1)

#define __HAVE_ARCH_UPDATE_MMU_TLB
#define update_mmu_tlb update_mmu_cache

static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmdp)
{
	pte_t *ptep = (pte_t *)pmdp;

	update_mmu_cache(vma, address, ptep);
}

#define __HAVE_ARCH_PTE_SAME
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
	return pte_val(pte_a) == pte_val(pte_b);
}

/*
 * Certain architectures need to do special things when PTEs within
 * a page table are directly modified.  Thus, the following hook is
 * made available.
 */
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	WRITE_ONCE(*ptep, pteval);
}

void flush_icache_pte(struct mm_struct *mm, pte_t pte);

static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
{
	if (pte_present(pteval) && pte_exec(pteval))
		flush_icache_pte(mm, pteval);

	set_pte(ptep, pteval);
}

#define PFN_PTE_SHIFT		_PAGE_PFN_SHIFT

static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, pte_t pteval, unsigned int nr)
{
	page_table_check_ptes_set(mm, ptep, pteval, nr);

	for (;;) {
		__set_pte_at(mm, ptep, pteval);
		if (--nr == 0)
			break;
		ptep++;
		pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
	}
}
#define set_ptes set_ptes

static inline void pte_clear(struct mm_struct *mm,
	unsigned long addr, pte_t *ptep)
{
	__set_pte_at(mm, ptep, __pte(0));
}

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS	/* defined in mm/pgtable.c */
extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
				 pte_t *ptep, pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG	/* defined in mm/pgtable.c */
extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
				     pte_t *ptep);

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
				       unsigned long address, pte_t *ptep)
{
	pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));

	page_table_check_pte_clear(mm, pte);

	return pte;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
				      unsigned long address, pte_t *ptep)
{
	atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
}

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
					 unsigned long address, pte_t *ptep)
{
	/*
	 * This comment is borrowed from x86, but applies equally to RISC-V:
	 *
	 * Clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#define pgprot_noncached pgprot_noncached
static inline pgprot_t pgprot_noncached(pgprot_t _prot)
{
	unsigned long prot = pgprot_val(_prot);

	prot &= ~_PAGE_MTMASK;
	prot |= _PAGE_IO;

	return __pgprot(prot);
}

#define pgprot_writecombine pgprot_writecombine
static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
{
	unsigned long prot = pgprot_val(_prot);

	prot &= ~_PAGE_MTMASK;
	prot |= _PAGE_NOCACHE;

	return __pgprot(prot);
}

/*
 * THP functions
 */
static inline pmd_t pte_pmd(pte_t pte)
{
	return __pmd(pte_val(pte));
}

static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
	return pmd;
}

static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
	return __pmd(pmd_val(pmd) & ~(_PAGE_PRESENT|_PAGE_PROT_NONE));
}

#define __pmd_to_phys(pmd)  (__page_val_to_pfn(pmd_val(pmd)) << PAGE_SHIFT)

static inline unsigned long pmd_pfn(pmd_t pmd)
{
	return ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT);
}

#define __pud_to_phys(pud)  (__page_val_to_pfn(pud_val(pud)) << PAGE_SHIFT)

static inline unsigned long pud_pfn(pud_t pud)
{
	return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT);
}

static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
	return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
}

#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
	return pte_write(pmd_pte(pmd));
}

#define pud_write pud_write
static inline int pud_write(pud_t pud)
{
	return pte_write(pud_pte(pud));
}

#define pmd_dirty pmd_dirty
static inline int pmd_dirty(pmd_t pmd)
{
	return pte_dirty(pmd_pte(pmd));
}

#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
	return pte_young(pmd_pte(pmd));
}

static inline int pmd_user(pmd_t pmd)
{
	return pte_user(pmd_pte(pmd));
}

static inline pmd_t pmd_mkold(pmd_t pmd)
{
	return pte_pmd(pte_mkold(pmd_pte(pmd)));
}

static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
	return pte_pmd(pte_mkyoung(pmd_pte(pmd)));
}

static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
	return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)));
}

static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
	return pte_pmd(pte_wrprotect(pmd_pte(pmd)));
}

static inline pmd_t pmd_mkclean(pmd_t pmd)
{
	return pte_pmd(pte_mkclean(pmd_pte(pmd)));
}

static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
	return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
}

static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
				pmd_t *pmdp, pmd_t pmd)
{
	page_table_check_pmd_set(mm, pmdp, pmd);
	return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd));
}

static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
				pud_t *pudp, pud_t pud)
{
	page_table_check_pud_set(mm, pudp, pud);
	return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud));
}

#ifdef CONFIG_PAGE_TABLE_CHECK
static inline bool pte_user_accessible_page(pte_t pte)
{
	return pte_present(pte) && pte_user(pte);
}

static inline bool pmd_user_accessible_page(pmd_t pmd)
{
	return pmd_leaf(pmd) && pmd_user(pmd);
}

static inline bool pud_user_accessible_page(pud_t pud)
{
	return pud_leaf(pud) && pud_user(pud);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
	return pmd_leaf(pmd);
}

#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmdp,
					pmd_t entry, int dirty)
{
	return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
}

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmdp)
{
	return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
}

#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
					unsigned long address, pmd_t *pmdp)
{
	pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0));

	page_table_check_pmd_clear(mm, pmd);

	return pmd;
}

#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
					unsigned long address, pmd_t *pmdp)
{
	ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
}

#define pmdp_establish pmdp_establish
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
				unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
	page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
	return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd)));
}

#define pmdp_collapse_flush pmdp_collapse_flush
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
				 unsigned long address, pmd_t *pmdp);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
 * are !pte_none() && !pte_present().
 *
 * Format of swap PTE:
 *	bit            0:	_PAGE_PRESENT (zero)
 *	bit       1 to 3:       _PAGE_LEAF (zero)
 *	bit            5:	_PAGE_PROT_NONE (zero)
 *	bit            6:	exclusive marker
 *	bits      7 to 11:	swap type
 *	bits 12 to XLEN-1:	swap offset
 */
#define __SWP_TYPE_SHIFT	7
#define __SWP_TYPE_BITS		5
#define __SWP_TYPE_MASK		((1UL << __SWP_TYPE_BITS) - 1)
#define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)

#define MAX_SWAPFILES_CHECK()	\
	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)

#define __swp_type(x)	(((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
#define __swp_offset(x)	((x).val >> __SWP_OFFSET_SHIFT)
#define __swp_entry(type, offset) ((swp_entry_t) \
	{ (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \
	  ((offset) << __SWP_OFFSET_SHIFT) })

#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x)	((pte_t) { (x).val })

static inline int pte_swp_exclusive(pte_t pte)
{
	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
}

static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE);
}

static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
	return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE);
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
#define __swp_entry_to_pmd(swp) __pmd((swp).val)
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

/*
 * In the RV64 Linux scheme, we give the user half of the virtual-address space
 * and give the kernel the other (upper) half.
 */
#ifdef CONFIG_64BIT
#define KERN_VIRT_START	(-(BIT(VA_BITS)) + TASK_SIZE)
#else
#define KERN_VIRT_START	FIXADDR_START
#endif

/*
 * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
 * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
 * Task size is:
 * -        0x9fc00000	(~2.5GB) for RV32.
 * -      0x4000000000	( 256GB) for RV64 using SV39 mmu
 * -    0x800000000000	( 128TB) for RV64 using SV48 mmu
 * - 0x100000000000000	(  64PB) for RV64 using SV57 mmu
 *
 * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V
 * Instruction Set Manual Volume II: Privileged Architecture" states that
 * "load and store effective addresses, which are 64bits, must have bits
 * 63–48 all equal to bit 47, or else a page-fault exception will occur."
 * Similarly for SV57, bits 63–57 must be equal to bit 56.
 */
#ifdef CONFIG_64BIT
#define TASK_SIZE_64	(PGDIR_SIZE * PTRS_PER_PGD / 2)
#define TASK_SIZE_MIN	(PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)

#ifdef CONFIG_COMPAT
#define TASK_SIZE_32	(_AC(0x80000000, UL) - PAGE_SIZE)
#define TASK_SIZE	(is_compat_task() ? \
			 TASK_SIZE_32 : TASK_SIZE_64)
#else
#define TASK_SIZE	TASK_SIZE_64
#endif

#else
#define TASK_SIZE	FIXADDR_START
#define TASK_SIZE_MIN	TASK_SIZE
#endif

#else /* CONFIG_MMU */

#define PAGE_SHARED		__pgprot(0)
#define PAGE_KERNEL		__pgprot(0)
#define swapper_pg_dir		NULL
#define TASK_SIZE		0xffffffffUL
#define VMALLOC_START		_AC(0, UL)
#define VMALLOC_END		TASK_SIZE

#endif /* !CONFIG_MMU */

extern char _start[];
extern void *_dtb_early_va;
extern uintptr_t _dtb_early_pa;
#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU)
#define dtb_early_va	(*(void **)XIP_FIXUP(&_dtb_early_va))
#define dtb_early_pa	(*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa))
#else
#define dtb_early_va	_dtb_early_va
#define dtb_early_pa	_dtb_early_pa
#endif /* CONFIG_XIP_KERNEL */
extern u64 satp_mode;

void paging_init(void);
void misc_mem_init(void);

/*
 * ZERO_PAGE is a global shared page that is always zero,
 * used for zero-mapped memory areas, etc.
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))

#endif /* !__ASSEMBLY__ */

#endif /* _ASM_RISCV_PGTABLE_H */
-												treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 286

Based on 1 normalized pattern(s):

  this program is free software you can redistribute it and or modify
  it under the terms of the gnu general public license as published by
  the free software foundation version 2 this program is distributed
  in the hope that it will be useful but without any warranty without
  even the implied warranty of merchantability or fitness for a
  particular purpose see the gnu general public license for more
  details

extracted by the scancode license scanner the SPDX license identifier

  GPL-2.0-only

has been chosen to replace the boilerplate/reference in 97 file(s).

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Allison Randal <allison@lohutok.net>
Reviewed-by: Alexios Zavras <alexios.zavras@intel.com>
Cc: linux-spdx@vger.kernel.org
Link: https://lkml.kernel.org/r/20190529141901.025053186@linutronix.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

											
										
										
											2019-05-29 14:18:00 +00:00
+								/* SPDX-License-Identifier: GPL-2.0-only */
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/*
 								 * Copyright (C) 2012 Regents of the University of California
 								 */
 								#ifndef _ASM_RISCV_PGTABLE_H
 								#define _ASM_RISCV_PGTABLE_H
 								#include <linux/mmzone.h>
-												RISC-V: Add PCIe I/O BAR memory mapping

For legacy I/O BARs (non-MMIO BARs) to work correctly on RISC-V Linux,
we need to establish a reserved memory region for them, so that drivers
that wish to use the legacy I/O BARs can issue reads and writes against
a memory region that is mapped to the host PCIe controller's I/O BAR
mapping.

Signed-off-by: Yash Shah <yash.shah@sifive.com>
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-25 08:30:03 +00:00
+								#include <linux/sizes.h>
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								#include <asm/pgtable-bits.h>
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#ifndef CONFIG_MMU
 								#define KERNEL_LINK_ADDR	PAGE_OFFSET
-												riscv: Fix is_linear_mapping with recent move of KASAN region

The KASAN region was recently moved between the linear mapping and the
kernel mapping, is_linear_mapping used to check the validity of an
address by using the start of the kernel mapping, which is now wrong.

Fix this by using the maximum size of the physical memory.

Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping")
Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Cc: stable@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-02-25 12:39:48 +00:00
+								#define KERN_VIRT_SIZE		(UL(-1))
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#else
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#define ADDRESS_SPACE_END	(UL(-1))
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#ifdef CONFIG_64BIT
 								/* Leave 2GB for kernel and BPF at the end of the address space */
 								#define KERNEL_LINK_ADDR	(ADDRESS_SPACE_END - SZ_2G + 1)
 								#else
 								#define KERNEL_LINK_ADDR	PAGE_OFFSET
 								#endif
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
-												riscv: Move KASAN mapping next to the kernel mapping

Now that KASAN_SHADOW_OFFSET is defined at compile time as a config,
this value must remain constant whatever the size of the virtual address
space, which is only possible by pushing this region at the end of the
address space next to the kernel mapping.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:45 +00:00
+								/* Number of entries in the page global directory */
 								#define PTRS_PER_PGD    (PAGE_SIZE / sizeof(pgd_t))
 								/* Number of entries in the page table */
 								#define PTRS_PER_PTE    (PAGE_SIZE / sizeof(pte_t))
 								/*
-												riscv: pgtable: Fixup comment for KERN_VIRT_SIZE

KERN_VIRT_SIZE is 1/4 of the entries of the page global directory,
not half.

Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping")
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Cc: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20230110080419.931185-1-guoren@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-01-10 08:04:19 +00:00
+								 * Half of the kernel address space (1/4 of the entries of the page global
-												riscv: Move KASAN mapping next to the kernel mapping

Now that KASAN_SHADOW_OFFSET is defined at compile time as a config,
this value must remain constant whatever the size of the virtual address
space, which is only possible by pushing this region at the end of the
address space next to the kernel mapping.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:45 +00:00
+								 * directory) is for the direct mapping.
 								 */
 								#define KERN_VIRT_SIZE          ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2)
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
-												riscv: Make vmalloc/vmemmap end equal to the start of the next region

We used to define VMALLOC_END equal to the start of the next region
*minus one* which is inconsistent with the use of this define in the
core code (for example, see the definitions of VMALLOC_TOTAL and
is_vmalloc_addr).

And then make the definition of VMEMMAP_END consistent with VMALLOC_END
and all other regions actually.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Reviewed-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-11-18 13:45:39 +00:00
+								#define VMALLOC_END      PAGE_OFFSET
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 								#define BPF_JIT_REGION_SIZE	(SZ_128M)
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#ifdef CONFIG_64BIT
-												riscv: Ensure BPF_JIT_REGION_START aligned with PMD size

Andreas reported commit fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
breaks booting with one kind of defconfig, I reproduced a kernel panic
with the defconfig:

[    0.138553] Unable to handle kernel paging request at virtual address ffffffff81201220
[    0.139159] Oops [#1]
[    0.139303] Modules linked in:
[    0.139601] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-rc5-default+ #1
[    0.139934] Hardware name: riscv-virtio,qemu (DT)
[    0.140193] epc : __memset+0xc4/0xfc
[    0.140416]  ra : skb_flow_dissector_init+0x1e/0x82
[    0.140609] epc : ffffffff8029806c ra : ffffffff8033be78 sp : ffffffe001647da0
[    0.140878]  gp : ffffffff81134b08 tp : ffffffe001654380 t0 : ffffffff81201158
[    0.141156]  t1 : 0000000000000002 t2 : 0000000000000154 s0 : ffffffe001647dd0
[    0.141424]  s1 : ffffffff80a43250 a0 : ffffffff81201220 a1 : 0000000000000000
[    0.141654]  a2 : 000000000000003c a3 : ffffffff81201258 a4 : 0000000000000064
[    0.141893]  a5 : ffffffff8029806c a6 : 0000000000000040 a7 : ffffffffffffffff
[    0.142126]  s2 : ffffffff81201220 s3 : 0000000000000009 s4 : ffffffff81135088
[    0.142353]  s5 : ffffffff81135038 s6 : ffffffff8080ce80 s7 : ffffffff80800438
[    0.142584]  s8 : ffffffff80bc6578 s9 : 0000000000000008 s10: ffffffff806000ac
[    0.142810]  s11: 0000000000000000 t3 : fffffffffffffffc t4 : 0000000000000000
[    0.143042]  t5 : 0000000000000155 t6 : 00000000000003ff
[    0.143220] status: 0000000000000120 badaddr: ffffffff81201220 cause: 000000000000000f
[    0.143560] [<ffffffff8029806c>] __memset+0xc4/0xfc
[    0.143859] [<ffffffff8061e984>] init_default_flow_dissectors+0x22/0x60
[    0.144092] [<ffffffff800010fc>] do_one_initcall+0x3e/0x168
[    0.144278] [<ffffffff80600df0>] kernel_init_freeable+0x1c8/0x224
[    0.144479] [<ffffffff804868a8>] kernel_init+0x12/0x110
[    0.144658] [<ffffffff800022de>] ret_from_exception+0x0/0xc
[    0.145124] ---[ end trace f1e9643daa46d591 ]---

After some investigation, I think I found the root cause: commit
2bfc6cd81bd ("move kernel mapping outside of linear mapping") moves
BPF JIT region after the kernel:

| #define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)

The &_end is unlikely aligned with PMD size, so the front bpf jit
region sits with part of kernel .data section in one PMD size mapping.
But kernel is mapped in PMD SIZE, when bpf_jit_binary_lock_ro() is
called to make the first bpf jit prog ROX, we will make part of kernel
.data section RO too, so when we write to, for example memset the
.data section, MMU will trigger a store page fault.

To fix the issue, we need to ensure the BPF JIT region is PMD size
aligned. This patch acchieve this goal by restoring the BPF JIT region
to original position, I.E the 128MB before kernel .text section. The
modification to kasan_init.c is inspired by Alexandre.

Fixes: fc8504765ec5 ("riscv: bpf: Avoid breaking W^X")
Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-06-18 14:09:13 +00:00
+								#define BPF_JIT_REGION_START	(BPF_JIT_REGION_END - BPF_JIT_REGION_SIZE)
 								#define BPF_JIT_REGION_END	(MODULES_END)
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#else
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
 								#define BPF_JIT_REGION_END	(VMALLOC_END)
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#endif
 								/* Modules always live before the kernel */
 								#ifdef CONFIG_64BIT
-												riscv: Move KASAN mapping next to the kernel mapping

Now that KASAN_SHADOW_OFFSET is defined at compile time as a config,
this value must remain constant whatever the size of the virtual address
space, which is only possible by pushing this region at the end of the
address space next to the kernel mapping.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:45 +00:00
+								/* This is used to define the end of the KASAN shadow region */
 								#define MODULES_LOWEST_VADDR	(KERNEL_LINK_ADDR - SZ_2G)
 								#define MODULES_VADDR		(PFN_ALIGN((unsigned long)&_end) - SZ_2G)
 								#define MODULES_END		(PFN_ALIGN((unsigned long)&_start))
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#endif
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
 								/*
 								 * Roughly size the vmemmap space to be large enough to fit enough
 								 * struct pages to map half the virtual address space. Then
 								 * position vmemmap directly below the VMALLOC region.
 								 */
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
+								#define VA_BITS_SV32 32
-												riscv: Allow to dynamically define VA_BITS

With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:48 +00:00
+								#ifdef CONFIG_64BIT
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
+								#define VA_BITS_SV39 39
 								#define VA_BITS_SV48 48
 								#define VA_BITS_SV57 57
-												riscv: mm: Control p4d's folding by pgtable_l5_enabled

To determine pgtable level at boot time, we can not use helper functions
in include/asm-generic/pgtable-nop4d.h and must implement these
functions. This patch uses pgtable_l5_enabled variable instead of
including pgtable-nop4d.h to controle p4d's folding, and implements
corresponding helper functions.

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-01-27 02:48:41 +00:00
+								#define VA_BITS		(pgtable_l5_enabled ? \
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
+												VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39))
-												riscv: Allow to dynamically define VA_BITS

With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:48 +00:00
+								#else
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
+								#define VA_BITS		VA_BITS_SV32
-												riscv: Allow to dynamically define VA_BITS

With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:48 +00:00
+								#endif
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define VMEMMAP_SHIFT \
-												riscv: Allow to dynamically define VA_BITS

With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:48 +00:00
+									(VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define VMEMMAP_SIZE	BIT(VMEMMAP_SHIFT)
-												riscv: Make vmalloc/vmemmap end equal to the start of the next region

We used to define VMALLOC_END equal to the start of the next region
*minus one* which is inconsistent with the use of this define in the
core code (for example, see the definitions of VMALLOC_TOTAL and
is_vmalloc_addr).

And then make the definition of VMEMMAP_END consistent with VMALLOC_END
and all other regions actually.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Reviewed-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-11-18 13:45:39 +00:00
+								#define VMEMMAP_END	VMALLOC_START
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#define VMEMMAP_START	(VMALLOC_START - VMEMMAP_SIZE)
 								/*
 								 * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel
 								 * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled.
 								 */
-												riscv: Sparse-Memory/vmemmap out-of-bounds fix

Offset vmemmap so that the first page of vmemmap will be mapped
to the first page of physical memory in order to ensure that
vmemmap’s bounds will be respected during
pfn_to_page()/page_to_pfn() operations.
The conversion macros will produce correct SV39/48/57 addresses
for every possible/valid DRAM_BASE inside the physical memory limits.

v2:Address Alex's comments

Suggested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Dimitris Vlachos <dvlachos@ics.forth.gr>
Reported-by: Dimitris Vlachos <dvlachos@ics.forth.gr>
Closes: https://lore.kernel.org/linux-riscv/20240202135030.42265-1-csd4492@csd.uoc.gr
Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem")
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20240229191723.32779-1-dvlachos@ics.forth.gr
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-29 19:17:23 +00:00
+								#define vmemmap		((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT))
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
 								#define PCI_IO_SIZE      SZ_16M
 								#define PCI_IO_END       VMEMMAP_START
 								#define PCI_IO_START     (PCI_IO_END - PCI_IO_SIZE)
 								#define FIXADDR_TOP      PCI_IO_START
 								#ifdef CONFIG_64BIT
-												riscv: Move early dtb mapping into the fixmap region

riscv establishes 2 virtual mappings:

- early_pg_dir maps the kernel which allows to discover the system
  memory
- swapper_pg_dir installs the final mapping (linear mapping included)

We used to map the dtb in early_pg_dir using DTB_EARLY_BASE_VA, and this
mapping was not carried over in swapper_pg_dir. It happens that
early_init_fdt_scan_reserved_mem() must be called before swapper_pg_dir is
setup otherwise we could allocate reserved memory defined in the dtb.
And this function initializes reserved_mem variable with addresses that
lie in the early_pg_dir dtb mapping: when those addresses are reused
with swapper_pg_dir, this mapping does not exist and then we trap.

The previous "fix" was incorrect as early_init_fdt_scan_reserved_mem()
must be called before swapper_pg_dir is set up otherwise we could
allocate in reserved memory defined in the dtb.

So move the dtb mapping in the fixmap region which is established in
early_pg_dir and handed over to swapper_pg_dir.

Fixes: 922b0375fc93 ("riscv: Fix memblock reservation for device tree blob")
Fixes: 8f3a2b4a96dc ("RISC-V: Move DT mapping outof fixmap")
Fixes: 50e63dd8ed92 ("riscv: fix reserved memory setup")
Reported-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/all/f8e67f82-103d-156c-deb0-d6d6e2756f5e@microchip.com/
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20230329081932.79831-2-alexghiti@rivosinc.com
Cc: stable@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-03-29 08:19:30 +00:00
+								#define MAX_FDT_SIZE	 PMD_SIZE
 								#define FIX_FDT_SIZE	 (MAX_FDT_SIZE + SZ_2M)
 								#define FIXADDR_SIZE     (PMD_SIZE + FIX_FDT_SIZE)
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#else
-												riscv: Move early dtb mapping into the fixmap region

riscv establishes 2 virtual mappings:

- early_pg_dir maps the kernel which allows to discover the system
  memory
- swapper_pg_dir installs the final mapping (linear mapping included)

We used to map the dtb in early_pg_dir using DTB_EARLY_BASE_VA, and this
mapping was not carried over in swapper_pg_dir. It happens that
early_init_fdt_scan_reserved_mem() must be called before swapper_pg_dir is
setup otherwise we could allocate reserved memory defined in the dtb.
And this function initializes reserved_mem variable with addresses that
lie in the early_pg_dir dtb mapping: when those addresses are reused
with swapper_pg_dir, this mapping does not exist and then we trap.

The previous "fix" was incorrect as early_init_fdt_scan_reserved_mem()
must be called before swapper_pg_dir is set up otherwise we could
allocate in reserved memory defined in the dtb.

So move the dtb mapping in the fixmap region which is established in
early_pg_dir and handed over to swapper_pg_dir.

Fixes: 922b0375fc93 ("riscv: Fix memblock reservation for device tree blob")
Fixes: 8f3a2b4a96dc ("RISC-V: Move DT mapping outof fixmap")
Fixes: 50e63dd8ed92 ("riscv: fix reserved memory setup")
Reported-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/all/f8e67f82-103d-156c-deb0-d6d6e2756f5e@microchip.com/
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Tested-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20230329081932.79831-2-alexghiti@rivosinc.com
Cc: stable@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-03-29 08:19:30 +00:00
+								#define MAX_FDT_SIZE	 PGDIR_SIZE
 								#define FIX_FDT_SIZE	 MAX_FDT_SIZE
 								#define FIXADDR_SIZE     (PGDIR_SIZE + FIX_FDT_SIZE)
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#endif
 								#define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
-												RISC-V: enable XIP

Introduce XIP (eXecute In Place) support for RISC-V platforms.
It allows code to be executed directly from non-volatile storage
directly addressable by the CPU, such as QSPI NOR flash which can
be found on many RISC-V platforms. This makes way for significant
optimization of RAM footprint. The XIP kernel is not compressed
since it has to run directly from flash, so it will occupy more
space on the non-volatile storage. The physical flash address used
to link the kernel object files and for storing it has to be known
at compile time and is represented by a Kconfig option.

XIP on RISC-V will for the time being only work on MMU-enabled
kernels.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
[Alex: Rebase on top of "Move kernel mapping outside the linear mapping" ]
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: disable XIP for allyesconfig]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-13 06:35:14 +00:00
-												RISC-V: Always define XIP_FIXUP

XIP depends on MMU, but XIP_FIXUP is used throughout the kernel in
order to avoid excessive ifdefs.  This just makes sure to always define
XIP_FIXUP, which will fix MMU=n builds.  XIP_OFFSET is used by assembly
but XIP_FIXUP is C-only, so they're split.

Fixes: 44c922572952 ("RISC-V: enable XIP")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
Tested-by: Alexandre Ghiti <alex@ghiti.fr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-28 21:45:12 +00:00
+								#endif
-												RISC-V: enable XIP

Introduce XIP (eXecute In Place) support for RISC-V platforms.
It allows code to be executed directly from non-volatile storage
directly addressable by the CPU, such as QSPI NOR flash which can
be found on many RISC-V platforms. This makes way for significant
optimization of RAM footprint. The XIP kernel is not compressed
since it has to run directly from flash, so it will occupy more
space on the non-volatile storage. The physical flash address used
to link the kernel object files and for storing it has to be known
at compile time and is represented by a Kconfig option.

XIP on RISC-V will for the time being only work on MMU-enabled
kernels.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
[Alex: Rebase on top of "Move kernel mapping outside the linear mapping" ]
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: disable XIP for allyesconfig]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-13 06:35:14 +00:00
+								#ifdef CONFIG_XIP_KERNEL
-												riscv: remove .text section size limitation for XIP

Currently there's a limit of 8MB for the .text section of a RISC-V
image in the XIP case. This breaks compilation of many automatic
builds and is generally inconvenient. This patch removes that
limitation and optimizes XIP image file size at the same time.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-10-11 09:14:14 +00:00
+								#define XIP_OFFSET		SZ_32M
 								#define XIP_OFFSET_MASK		(SZ_32M - 1)
-												riscv: Simplify xip and !xip kernel address conversion macros

To simplify the kernel address conversion code, make the same definition of
kernel_mapping_pa_to_va and kernel_mapping_va_to_pa compatible for both xip
and !xip kernel by defining XIP_OFFSET to 0 in !xip kernel.

Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-06-04 11:49:48 +00:00
+								#else
 								#define XIP_OFFSET		0
-												RISC-V: Move all address space definition macros to one place

If both CONFIG_KASAN and CONFIG_SPARSEMEM_VMEMMAP are set, we get the
following compilation error.

---------------------------------------------------------------
./arch/riscv/include/asm/pgtable-64.h: In function ‘pud_page’:
./include/asm-generic/memory_model.h:54:29: error: ‘vmemmap’ undeclared
(first use in this function); did you mean ‘mem_map’?
 #define __pfn_to_page(pfn) (vmemmap + (pfn))
                             ^~~~~~~
./include/asm-generic/memory_model.h:82:21: note: in expansion of
macro ‘__pfn_to_page’

 #define pfn_to_page __pfn_to_page
                     ^~~~~~~~~~~~~
./arch/riscv/include/asm/pgtable-64.h:70:9: note: in expansion of macro
‘pfn_to_page’
  return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
---------------------------------------------------------------

Fix the compliation errors by moving all the address space definition
macros before including pgtable-64.h.

Fixes: 8ad8b72721d0 (riscv: Add KASAN support)

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-02-24 19:34:36 +00:00
+								#endif
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								#ifndef __ASSEMBLY__
 								#include <asm/page.h>
 								#include <asm/tlbflush.h>
 								#include <linux/mm_types.h>
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
+								#include <asm/compat.h>
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
-												riscv: Fix accessing pfn bits in PTEs for non-32bit variants

On rv32 the PFN part of PTEs is defined to use bits [xlen-1:10]
while on rv64 it is defined to use bits [53:10], leaving [63:54]
as reserved.

With upcoming optional extensions like svpbmt these previously
reserved bits will get used so simply right-shifting the PTE
to get the PFN won't be enough.

So introduce a _PAGE_PFN_MASK constant to mask the correct bits
for both rv32 and rv64 before shifting.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Link: https://lore.kernel.org/r/20220511192921.2223629-9-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:17 +00:00
+								#define __page_val_to_pfn(_val)  (((_val) & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#ifdef CONFIG_64BIT
 								#include <asm/pgtable-64.h>
-												RISC-V: mm: Restrict address space for sv39,sv48,sv57

Make sv48 the default address space for mmap as some applications
currently depend on this assumption. A hint address passed to mmap will
cause the largest address space that fits entirely into the hint to be
used. If the hint is less than or equal to 1<<38, an sv39 address will
be used. An exception is that if the hint address is 0, then a sv48
address will be used. After an address space is completely full, the next
smallest address space will be used.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:01 +00:00
 								#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1))
 								#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
 								#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
 								#define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
 								#define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39)
 								#define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64)
 								#define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#else
 								#include <asm/pgtable-32.h>
 								#endif /* CONFIG_64BIT */
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								#include <linux/page_table_check.h>
-												RISC-V: Always define XIP_FIXUP

XIP depends on MMU, but XIP_FIXUP is used throughout the kernel in
order to avoid excessive ifdefs.  This just makes sure to always define
XIP_FIXUP, which will fix MMU=n builds.  XIP_OFFSET is used by assembly
but XIP_FIXUP is C-only, so they're split.

Fixes: 44c922572952 ("RISC-V: enable XIP")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
Tested-by: Alexandre Ghiti <alex@ghiti.fr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-28 21:45:12 +00:00
+								#ifdef CONFIG_XIP_KERNEL
 								#define XIP_FIXUP(addr) ({							\
 									uintptr_t __a = (uintptr_t)(addr);					\
-												riscv: remove .text section size limitation for XIP

Currently there's a limit of 8MB for the .text section of a RISC-V
image in the XIP case. This breaks compilation of many automatic
builds and is generally inconvenient. This patch removes that
limitation and optimizes XIP image file size at the same time.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-10-11 09:14:14 +00:00
+									(__a >= CONFIG_XIP_PHYS_ADDR && \
 									 __a < CONFIG_XIP_PHYS_ADDR + XIP_OFFSET * 2) ?	\
-												RISC-V: Always define XIP_FIXUP

XIP depends on MMU, but XIP_FIXUP is used throughout the kernel in
order to avoid excessive ifdefs.  This just makes sure to always define
XIP_FIXUP, which will fix MMU=n builds.  XIP_OFFSET is used by assembly
but XIP_FIXUP is C-only, so they're split.

Fixes: 44c922572952 ("RISC-V: enable XIP")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
Tested-by: Alexandre Ghiti <alex@ghiti.fr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-28 21:45:12 +00:00
+										__a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\
 										__a;								\
 									})
 								#else
 								#define XIP_FIXUP(addr)		(addr)
 								#endif /* CONFIG_XIP_KERNEL */
-												riscv: Implement sv48 support

By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that offers
128TB of virtual address space to userspace and allows up to 64TB of
physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level into
PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:51 +00:00
+								struct pt_alloc_ops {
 									pte_t *(*get_pte_virt)(phys_addr_t pa);
 									phys_addr_t (*alloc_pte)(uintptr_t va);
 								#ifndef __PAGETABLE_PMD_FOLDED
 									pmd_t *(*get_pmd_virt)(phys_addr_t pa);
 									phys_addr_t (*alloc_pmd)(uintptr_t va);
 									pud_t *(*get_pud_virt)(phys_addr_t pa);
 									phys_addr_t (*alloc_pud)(uintptr_t va);
-												riscv: mm: Prepare pt_ops helper functions for sv57

This patch prepare some pt_ops helper functions which will be used in
creating sv57 mappings during boot time.

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-01-27 02:48:42 +00:00
+									p4d_t *(*get_p4d_virt)(phys_addr_t pa);
 									phys_addr_t (*alloc_p4d)(uintptr_t va);
-												riscv: Implement sv48 support

By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that offers
128TB of virtual address space to userspace and allows up to 64TB of
physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level into
PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:51 +00:00
+								#endif
 								};
-												RISC-V: Introduce sv48 support without relocatable kernel

This patchset allows to have a single kernel for sv39 and sv48 without
being relocatable.

The idea comes from Arnd Bergmann who suggested to do the same as x86,
that is mapping the kernel to the end of the address space, which allows
the kernel to be linked at the same address for both sv39 and sv48 and
then does not require to be relocated at runtime.

This implements sv48 support at runtime. The kernel will try to boot
with 4-level page table and will fallback to 3-level if the HW does not
support it. Folding the 4th level into a 3-level page table has almost
no cost at runtime.

Note that kasan region had to be moved to the end of the address space
since its location must be known at compile-time and then be valid for
both sv39 and sv48 (and sv57 that is coming).

* riscv-sv48-v3:
  riscv: Explicit comment about user virtual address space size
  riscv: Use pgtable_l4_enabled to output mmu_type in cpuinfo
  riscv: Implement sv48 support
  asm-generic: Prepare for riscv use of pud_alloc_one and pud_free
  riscv: Allow to dynamically define VA_BITS
  riscv: Introduce functions to switch pt_ops
  riscv: Split early kasan mapping to prepare sv48 introduction
  riscv: Move KASAN mapping next to the kernel mapping
  riscv: Get rid of MAXPHYSMEM configs

Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-01-20 03:23:41 +00:00
+								extern struct pt_alloc_ops pt_ops __initdata;
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
+								#ifdef CONFIG_MMU
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/* Number of PGD entries that a user-mode program can use */
 								#define USER_PTRS_PER_PGD   (TASK_SIZE / PGDIR_SIZE)
 								/* Page protection bits */
 								#define _PAGE_BASE	(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER)
-												riscv/mm: Adjust PAGE_PROT_NONE to comply with THP semantics

This is a preparation for enabling THP migration.
As the commit b65399f6111b("arm64/mm: Change THP helpers
to comply with generic MM semantics") mentioned, pmd_present()
and pmd_trans_huge() are expected to behave in the following
manner:
-------------------------------------------------------------------------
|	PMD states	|	pmd_present	|	pmd_trans_huge	|
-------------------------------------------------------------------------
|	Mapped		|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Splitting	|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Migration/Swap	|	No		|	No		|
-------------------------------------------------------------------------

At present the PROT_NONE bit reuses the READ bit could not comply with
above semantics with two problems:
1. When splitting a PMD THP, PMD is first invalidated with
pmdp_invalidate()->pmd_mkinvalid(), which clears the PRESENT bit
and PROT_NONE bit/READ bit, if the PMD is read-only, then the PAGE_LEAF
property is also cleared, which results in pmd_present() return false.
2. When migrating, the swap entry only clear the PRESENT bit
and PROT_NONE bit/READ bit, the W/X bit may be set, so _PAGE_LEAF may be
true which results in pmd_present() return true.

Solution:
Adjust PROT_NONE bit from READ to GLOBAL bit can satisfy the above rules:
1. GLOBAL bit has no other meanings, not like the R/W/X bit, which is
also relative with _PAGE_LEAF property.
2. GLOBAL bit is at bit 5, making swap entry start from bit 6, bit 0-5
are zero, which means the PRESENT, PROT_NONE, and PAGE_LEAF are
all false, then the pmd_present() and pmd_trans_huge() return false when
in migration/swap.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-11-23 14:06:37 +00:00
+								#define PAGE_NONE		__pgprot(_PAGE_PROT_NONE | _PAGE_READ)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#define PAGE_READ		__pgprot(_PAGE_BASE | _PAGE_READ)
 								#define PAGE_WRITE		__pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_WRITE)
 								#define PAGE_EXEC		__pgprot(_PAGE_BASE | _PAGE_EXEC)
 								#define PAGE_READ_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
 								#define PAGE_WRITE_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ |	\
 													 _PAGE_EXEC | _PAGE_WRITE)
 								#define PAGE_COPY		PAGE_READ
-												riscv: mm: Ensure prot of VM_WRITE and VM_EXEC must be readable

Commit 8aeb7b17f04e ("RISC-V: Make mmap() with PROT_WRITE imply PROT_READ")
allows riscv to use mmap with PROT_WRITE only, and meanwhile mmap with w+x
is also permitted. However, when userspace tries to access this page with
PROT_WRITE|PROT_EXEC, which causes infinite loop at load page fault as
well as it triggers soft lockup. According to riscv privileged spec,
"Writable pages must also be marked readable". The fix to drop the
`PAGE_COPY_READ_EXEC` and then `PAGE_COPY_EXEC` would be just used instead.
This aligns the other arches (i.e arm64) for protection_map.

Fixes: 8aeb7b17f04e ("RISC-V: Make mmap() with PROT_WRITE imply PROT_READ")
Signed-off-by: Hsieh-Tseng Shen <woodrow.shen@sifive.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20230425102828.1616812-1-woodrow.shen@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-04-25 10:28:28 +00:00
+								#define PAGE_COPY_EXEC		PAGE_READ_EXEC
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#define PAGE_SHARED		PAGE_WRITE
 								#define PAGE_SHARED_EXEC	PAGE_WRITE_EXEC
 								#define _PAGE_KERNEL		(_PAGE_READ \
 												| _PAGE_WRITE \
 												| _PAGE_PRESENT \
 												| _PAGE_ACCESSED \
-												riscv: Use global mappings for kernel pages

We map kernel pages into all addresses spages, so they can be marked as
global.  This allows hardware to avoid flushing the kernel mappings when
moving between address spaces.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[Palmer: commit text]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-05-26 05:49:20 +00:00
+												| _PAGE_DIRTY \
 												| _PAGE_GLOBAL)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								#define PAGE_KERNEL		__pgprot(_PAGE_KERNEL)
-												RISC-V: Add EFI runtime services

This patch adds EFI runtime service support for RISC-V.

Signed-off-by: Atish Patra <atish.patra@wdc.com>
[ardb: - Remove the page check]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-09-17 22:37:15 +00:00
+								#define PAGE_KERNEL_READ	__pgprot(_PAGE_KERNEL & ~_PAGE_WRITE)
 								#define PAGE_KERNEL_EXEC	__pgprot(_PAGE_KERNEL | _PAGE_EXEC)
 								#define PAGE_KERNEL_READ_EXEC	__pgprot((_PAGE_KERNEL & ~_PAGE_WRITE) \
 													 | _PAGE_EXEC)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												RISC-V: Setup initial page tables in two stages

Currently, the setup_vm() does initial page table setup in one-shot
very early before enabling MMU. Due to this, the setup_vm() has to map
all possible kernel virtual addresses since it does not know size and
location of RAM. This means we have kernel mappings for non-existent
RAM and any buggy driver (or kernel) code doing out-of-bound access
to RAM will not fault and cause underterministic behaviour.

Further, the setup_vm() creates PMD mappings (i.e. 2M mappings) for
RV64 systems. This means for PAGE_OFFSET=0xffffffe000000000 (i.e.
MAXPHYSMEM_128GB=y), the setup_vm() will require 129 pages (i.e.
516 KB) of memory for initial page tables which is never freed. The
memory required for initial page tables will further increase if
we chose a lower value of PAGE_OFFSET (e.g. 0xffffff0000000000)

This patch implements two-staged initial page table setup, as follows:
1. Early (i.e. setup_vm()): This stage maps kernel image and DTB in
a early page table (i.e. early_pg_dir). The early_pg_dir will be used
only by boot HART so it can be freed as-part of init memory free-up.
2. Final (i.e. setup_vm_final()): This stage maps all possible RAM
banks in the final page table (i.e. swapper_pg_dir). The boot HART
will start using swapper_pg_dir at the end of setup_vm_final(). All
non-boot HARTs directly use the swapper_pg_dir created by boot HART.

We have following advantages with this new approach:
1. Kernel mappings for non-existent RAM don't exists anymore.
2. Memory consumed by initial page tables is now indpendent of the
chosen PAGE_OFFSET.
3. Memory consumed by initial page tables on RV64 system is 2 pages
(i.e. 8 KB) which has significantly reduced and these pages will be
freed as-part of the init memory free-up.

The patch also provides a foundation for implementing strict kernel
mappings where we protect kernel text and rodata using PTE permissions.

Suggested-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Anup Patel <anup.patel@wdc.com>
[paul.walmsley@sifive.com: updated to apply; fixed a checkpatch warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-06-28 20:36:21 +00:00
+								#define PAGE_TABLE		__pgprot(_PAGE_TABLE)
-												riscv: add RISC-V Svpbmt extension support

Svpbmt (the S should be capitalized) is the
"Supervisor-mode: page-based memory types" extension
that specifies attributes for cacheability, idempotency
and ordering.

The relevant settings are done in special bits in PTEs:

Here is the svpbmt PTE format:
| 63 | 62-61 | 60-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
  N     MT     RSW    D   A   G   U   X   W   R   V
        ^

Of the Reserved bits [63:54] in a leaf PTE, the high bit is already
allocated (as the N bit), so bits [62:61] are used as the MT (aka
MemType) field. This field specifies one of three memory types that
are close equivalents (or equivalent in effect) to the three main x86
and ARMv8 memory types - as shown in the following table.

RISC-V
Encoding &
MemType     RISC-V Description
----------  ------------------------------------------------
00 - PMA    Normal Cacheable, No change to implied PMA memory type
01 - NC     Non-cacheable, idempotent, weakly-ordered Main Memory
10 - IO     Non-cacheable, non-idempotent, strongly-ordered I/O memory
11 - Rsvd   Reserved for future standard use

As the extension will not be present on all implementations,
implement a method to handle cpufeatures via alternatives
to not incur runtime penalties on cpu variants not supporting
specific extensions and patch relevant code parts at runtime.

Co-developed-by: Wei Fu <wefu@redhat.com>
Signed-off-by: Wei Fu <wefu@redhat.com>
Co-developed-by: Liu Shaohua <liush@allwinnertech.com>
Signed-off-by: Liu Shaohua <liush@allwinnertech.com>
Co-developed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
[moved to use the alternatives mechanism]
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Link: https://lore.kernel.org/r/20220511192921.2223629-10-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:18 +00:00
+								#define _PAGE_IOREMAP	((_PAGE_KERNEL & ~_PAGE_MTMASK) | _PAGE_IO)
 								#define PAGE_KERNEL_IO		__pgprot(_PAGE_IOREMAP)
-												riscv: use the generic ioremap code

Use the generic ioremap code instead of providing a local version.
Note that this relies on the asm-generic no-op definition of
pgprot_noncached.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Paul Walmsley <paul.walmsley@sifive.com>
Tested-by: Paul Walmsley <paul.walmsley@sifive.com> # rv32, rv64 boot
Acked-by: Paul Walmsley <paul.walmsley@sifive.com> # arch/riscv

											
										
										
											2019-08-13 09:27:56 +00:00
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								extern pgd_t swapper_pg_dir[];
-												riscv: mm: fix 2 instances of -Wmissing-variable-declarations

I'm looking to enable -Wmissing-variable-declarations behind W=1. 0day
bot spotted the following instance in ARCH=riscv builds:

  arch/riscv/mm/init.c:276:7: warning: no previous extern declaration
  for non-static variable 'trampoline_pg_dir'
  [-Wmissing-variable-declarations]
  276 | pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
      |       ^
  arch/riscv/mm/init.c:276:1: note: declare 'static' if the variable is
  not intended to be used outside of this translation unit
  276 | pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
      | ^
  arch/riscv/mm/init.c:279:7: warning: no previous extern declaration
  for non-static variable 'early_pg_dir'
  [-Wmissing-variable-declarations]
  279 | pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
      |       ^
  arch/riscv/mm/init.c:279:1: note: declare 'static' if the variable is
  not intended to be used outside of this translation unit
  279 | pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
      | ^

These symbols are referenced by more than one translation unit, so make
sure they're both declared and include the correct header for their
declarations. Finally, sort the list of includes to help keep them tidy.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/llvm/202308081000.tTL1ElTr-lkp@intel.com/
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Link: https://lore.kernel.org/r/20230808-riscv_static-v2-1-2a1e2d2c7a4f@google.com
Cc: stable@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-08 16:35:00 +00:00
+								extern pgd_t trampoline_pg_dir[];
 								extern pgd_t early_pg_dir[];
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 								static inline int pmd_present(pmd_t pmd)
 								{
 									/*
 									 * Checking for _PAGE_LEAF is needed too because:
 									 * When splitting a THP, split_huge_page() will temporarily clear
 									 * the present bit, in this situation, pmd_present() and
 									 * pmd_trans_huge() still needs to return true.
 									 */
 									return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE | _PAGE_LEAF));
 								}
 								#else
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								static inline int pmd_present(pmd_t pmd)
 								{
-												riscv: Add pte bit to distinguish swap from invalid

Previously, invalid PTEs and swap PTEs had the same binary
representation, causing errors when attempting to unmap PROT_NONE
mappings, including implicit unmap on exit.

Typical error:

swap_info_get: Bad swap file entry 40000000007a9879
BUG: Bad page map in process a.out  pte:3d4c3cc0 pmd:3e521401

Cc: stable@vger.kernel.org
Signed-off-by: Stefan O'Rear <sorear2@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2018-12-16 18:03:36 +00:00
+									return (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PROT_NONE));
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								#endif
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								static inline int pmd_none(pmd_t pmd)
 								{
 									return (pmd_val(pmd) == 0);
 								}
 								static inline int pmd_bad(pmd_t pmd)
 								{
-												riscv: mm: make pmd_bad() check leaf condition

In the definition in Documentation/vm/arch_pgtable_helpers.rst,
pmd_bad() means test a non-table mapped PMD, so it should also
return true when it is a leaf page.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:48 +00:00
+									return !pmd_present(pmd) || (pmd_val(pmd) & _PAGE_LEAF);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												riscv: mm: add p?d_leaf() definitions

walk_page_range() is going to be allowed to walk page tables other than
those of user space.  For this it needs to know when it has reached a
'leaf' entry in the page tables.  This information is provided by the
p?d_leaf() functions/macros.

For riscv a page is a leaf page when it has a read, write or execute bit
set on it.

Link: http://lkml.kernel.org/r/20191218162402.45610-8-steven.price@arm.com
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Alexandre Ghiti <alex@ghiti.fr>
Reviewed-by: Zong Li <zong.li@sifive.com>
Acked-by: Paul Walmsley <paul.walmsley@sifive.com>	[arch/riscv]
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

											
										
										
											2020-02-04 01:35:28 +00:00
+								#define pmd_leaf	pmd_leaf
-												mm/treewide: align up pXd_leaf() retval across archs

Even if pXd_leaf() API is defined globally, it's not clear on the retval,
and there are three types used (bool, int, unsigned log).

Always return a boolean for pXd_leaf() APIs.

Link: https://lkml.kernel.org/r/20240305043750.93762-11-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2024-03-05 04:37:50 +00:00
+								static inline bool pmd_leaf(pmd_t pmd)
-												riscv: mm: add p?d_leaf() definitions

walk_page_range() is going to be allowed to walk page tables other than
those of user space.  For this it needs to know when it has reached a
'leaf' entry in the page tables.  This information is provided by the
p?d_leaf() functions/macros.

For riscv a page is a leaf page when it has a read, write or execute bit
set on it.

Link: http://lkml.kernel.org/r/20191218162402.45610-8-steven.price@arm.com
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Alexandre Ghiti <alex@ghiti.fr>
Reviewed-by: Zong Li <zong.li@sifive.com>
Acked-by: Paul Walmsley <paul.walmsley@sifive.com>	[arch/riscv]
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

											
										
										
											2020-02-04 01:35:28 +00:00
+								{
-												riscv: mm: add _PAGE_LEAF macro

In riscv, a page table entry is leaf when any bit of read, write,
or execute bit is set. So add a macro:_PAGE_LEAF instead of
(_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC), which is frequently used
to determine if it is a leaf page. This make code easier to read,
without any functional change.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:47 +00:00
+									return pmd_present(pmd) && (pmd_val(pmd) & _PAGE_LEAF);
-												riscv: mm: add p?d_leaf() definitions

walk_page_range() is going to be allowed to walk page tables other than
those of user space.  For this it needs to know when it has reached a
'leaf' entry in the page tables.  This information is provided by the
p?d_leaf() functions/macros.

For riscv a page is a leaf page when it has a read, write or execute bit
set on it.

Link: http://lkml.kernel.org/r/20191218162402.45610-8-steven.price@arm.com
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Alexandre Ghiti <alex@ghiti.fr>
Reviewed-by: Zong Li <zong.li@sifive.com>
Acked-by: Paul Walmsley <paul.walmsley@sifive.com>	[arch/riscv]
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Liang, Kan" <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

											
										
										
											2020-02-04 01:35:28 +00:00
+								}
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 								{
-												riscv: Use WRITE_ONCE() when setting page table entries

To avoid any compiler "weirdness" when accessing page table entries which
are concurrently modified by the HW, let's use WRITE_ONCE() macro
(commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing
page tables") gives a great explanation with more details).

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20231213203001.179237-2-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-12-13 20:29:58 +00:00
+									WRITE_ONCE(*pmdp, pmd);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								static inline void pmd_clear(pmd_t *pmdp)
 								{
 									set_pmd(pmdp, __pmd(0));
 								}
 								static inline pgd_t pfn_pgd(unsigned long pfn, pgprot_t prot)
 								{
-												riscv: add memory-type errata for T-Head

Some current cpus based on T-Head cores implement memory-types
way different than described in the svpbmt spec even going
so far as using PTE bits marked as reserved.

Add the T-Head vendor-id and necessary errata code to
replace the affected instructions.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Samuel Holland <samuel@sholland.org>
Link: https://lore.kernel.org/r/20220511192921.2223629-13-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:21 +00:00
+									unsigned long prot_val = pgprot_val(prot);
 									ALT_THEAD_PMA(prot_val);
 									return __pgd((pfn << _PAGE_PFN_SHIFT) | prot_val);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												RISC-V: Setup initial page tables in two stages

Currently, the setup_vm() does initial page table setup in one-shot
very early before enabling MMU. Due to this, the setup_vm() has to map
all possible kernel virtual addresses since it does not know size and
location of RAM. This means we have kernel mappings for non-existent
RAM and any buggy driver (or kernel) code doing out-of-bound access
to RAM will not fault and cause underterministic behaviour.

Further, the setup_vm() creates PMD mappings (i.e. 2M mappings) for
RV64 systems. This means for PAGE_OFFSET=0xffffffe000000000 (i.e.
MAXPHYSMEM_128GB=y), the setup_vm() will require 129 pages (i.e.
516 KB) of memory for initial page tables which is never freed. The
memory required for initial page tables will further increase if
we chose a lower value of PAGE_OFFSET (e.g. 0xffffff0000000000)

This patch implements two-staged initial page table setup, as follows:
1. Early (i.e. setup_vm()): This stage maps kernel image and DTB in
a early page table (i.e. early_pg_dir). The early_pg_dir will be used
only by boot HART so it can be freed as-part of init memory free-up.
2. Final (i.e. setup_vm_final()): This stage maps all possible RAM
banks in the final page table (i.e. swapper_pg_dir). The boot HART
will start using swapper_pg_dir at the end of setup_vm_final(). All
non-boot HARTs directly use the swapper_pg_dir created by boot HART.

We have following advantages with this new approach:
1. Kernel mappings for non-existent RAM don't exists anymore.
2. Memory consumed by initial page tables is now indpendent of the
chosen PAGE_OFFSET.
3. Memory consumed by initial page tables on RV64 system is 2 pages
(i.e. 8 KB) which has significantly reduced and these pages will be
freed as-part of the init memory free-up.

The patch also provides a foundation for implementing strict kernel
mappings where we protect kernel text and rodata using PTE permissions.

Suggested-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Anup Patel <anup.patel@wdc.com>
[paul.walmsley@sifive.com: updated to apply; fixed a checkpatch warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-06-28 20:36:21 +00:00
+								static inline unsigned long _pgd_pfn(pgd_t pgd)
 								{
-												riscv: Fix missing PAGE_PFN_MASK

There are a bunch of functions that use the PFN from a page table entry
that end up with the svpbmt upper-bits because they are missing the newly
introduced PAGE_PFN_MASK which leads to wrong addresses conversions and
then crash: fix this by adding this mask.

Fixes: 100631b48ded ("riscv: Fix accessing pfn bits in PTEs for non-32bit variants")
Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Anup Patel <anup@brainfault.org>

											
										
										
											2022-07-11 03:59:51 +00:00
+									return __page_val_to_pfn(pgd_val(pgd));
-												RISC-V: Setup initial page tables in two stages

Currently, the setup_vm() does initial page table setup in one-shot
very early before enabling MMU. Due to this, the setup_vm() has to map
all possible kernel virtual addresses since it does not know size and
location of RAM. This means we have kernel mappings for non-existent
RAM and any buggy driver (or kernel) code doing out-of-bound access
to RAM will not fault and cause underterministic behaviour.

Further, the setup_vm() creates PMD mappings (i.e. 2M mappings) for
RV64 systems. This means for PAGE_OFFSET=0xffffffe000000000 (i.e.
MAXPHYSMEM_128GB=y), the setup_vm() will require 129 pages (i.e.
516 KB) of memory for initial page tables which is never freed. The
memory required for initial page tables will further increase if
we chose a lower value of PAGE_OFFSET (e.g. 0xffffff0000000000)

This patch implements two-staged initial page table setup, as follows:
1. Early (i.e. setup_vm()): This stage maps kernel image and DTB in
a early page table (i.e. early_pg_dir). The early_pg_dir will be used
only by boot HART so it can be freed as-part of init memory free-up.
2. Final (i.e. setup_vm_final()): This stage maps all possible RAM
banks in the final page table (i.e. swapper_pg_dir). The boot HART
will start using swapper_pg_dir at the end of setup_vm_final(). All
non-boot HARTs directly use the swapper_pg_dir created by boot HART.

We have following advantages with this new approach:
1. Kernel mappings for non-existent RAM don't exists anymore.
2. Memory consumed by initial page tables is now indpendent of the
chosen PAGE_OFFSET.
3. Memory consumed by initial page tables on RV64 system is 2 pages
(i.e. 8 KB) which has significantly reduced and these pages will be
freed as-part of the init memory free-up.

The patch also provides a foundation for implementing strict kernel
mappings where we protect kernel text and rodata using PTE permissions.

Suggested-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Anup Patel <anup.patel@wdc.com>
[paul.walmsley@sifive.com: updated to apply; fixed a checkpatch warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-06-28 20:36:21 +00:00
+								}
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								static inline struct page *pmd_page(pmd_t pmd)
 								{
-												riscv: Fix accessing pfn bits in PTEs for non-32bit variants

On rv32 the PFN part of PTEs is defined to use bits [xlen-1:10]
while on rv64 it is defined to use bits [53:10], leaving [63:54]
as reserved.

With upcoming optional extensions like svpbmt these previously
reserved bits will get used so simply right-shifting the PTE
to get the PFN won't be enough.

So introduce a _PAGE_PFN_MASK constant to mask the correct bits
for both rv32 and rv64 before shifting.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Link: https://lore.kernel.org/r/20220511192921.2223629-9-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:17 +00:00
+									return pfn_to_page(__page_val_to_pfn(pmd_val(pmd)));
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 								{
-												riscv: Fix accessing pfn bits in PTEs for non-32bit variants

On rv32 the PFN part of PTEs is defined to use bits [xlen-1:10]
while on rv64 it is defined to use bits [53:10], leaving [63:54]
as reserved.

With upcoming optional extensions like svpbmt these previously
reserved bits will get used so simply right-shifting the PTE
to get the PFN won't be enough.

So introduce a _PAGE_PFN_MASK constant to mask the correct bits
for both rv32 and rv64 before shifting.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Link: https://lore.kernel.org/r/20220511192921.2223629-9-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:17 +00:00
+									return (unsigned long)pfn_to_virt(__page_val_to_pfn(pmd_val(pmd)));
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												riscv: Add support pte_protnone and pmd_protnone if CONFIG_NUMA_BALANCING

These two functions are used to distinguish between PROT_NONENUMA
protections and hinting fault protections.

Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-11-19 00:38:28 +00:00
+								static inline pte_t pmd_pte(pmd_t pmd)
 								{
 									return __pte(pmd_val(pmd));
 								}
-												riscv: mremap speedup - enable HAVE_MOVE_PUD and HAVE_MOVE_PMD

HAVE_MOVE_PUD enables remapping pages at the PUD level if both the source
and destination addresses are PUD-aligned.
HAVE_MOVE_PMD does similar speedup on the PMD level.

With HAVE_MOVE_PUD enabled, there is about a 143x improvement on qemu
With HAVE_MOVE_PMD enabled, there is about a 5x improvement on qemu

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-16 16:37:22 +00:00
+								static inline pte_t pud_pte(pud_t pud)
 								{
 									return __pte(pud_val(pud));
 								}
-												riscv: mm: modify pte format for Svnapot

Add one alternative to enable/disable svnapot support, enable this static
key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
option is set. It will influence the behavior of has_svnapot. All code
dependent on svnapot should make sure that has_svnapot return true firstly.

Modify PTE definition for Svnapot, and creates some functions in pgtable.h
to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
64KB napot size is supported in spec, so some macros has only 64KB version.

Signed-off-by: Qinglin Pan <panqinglin00@gmail.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20230209131647.17245-2-panqinglin00@gmail.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-02-09 13:16:45 +00:00
+								#ifdef CONFIG_RISCV_ISA_SVNAPOT
-												riscv: Rearrange hwcap.h and cpufeature.h

Now hwcap.h and cpufeature.h are mutually including each other, and most of
the variable/API declarations in hwcap.h are implemented in cpufeature.c,
so, it's better to move them into cpufeature.h and leave only macros for
ISA extension logical IDs in hwcap.h.

BTW, the riscv_isa_extension_mask macro is not used now, so this patch
removes it.

Suggested-by: Andrew Jones <ajones@ventanamicro.com>
Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20231031064553.2319688-2-xiao.w.wang@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-10-31 06:45:52 +00:00
+								#include <asm/cpufeature.h>
-												riscv: mm: modify pte format for Svnapot

Add one alternative to enable/disable svnapot support, enable this static
key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
option is set. It will influence the behavior of has_svnapot. All code
dependent on svnapot should make sure that has_svnapot return true firstly.

Modify PTE definition for Svnapot, and creates some functions in pgtable.h
to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
64KB napot size is supported in spec, so some macros has only 64KB version.

Signed-off-by: Qinglin Pan <panqinglin00@gmail.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20230209131647.17245-2-panqinglin00@gmail.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-02-09 13:16:45 +00:00
 								static __always_inline bool has_svnapot(void)
 								{
 									return riscv_has_extension_likely(RISCV_ISA_EXT_SVNAPOT);
 								}
 								static inline unsigned long pte_napot(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_NAPOT;
 								}
 								static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
 								{
 									int pos = order - 1 + _PAGE_PFN_SHIFT;
 									unsigned long napot_bit = BIT(pos);
 									unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
 									return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
 								}
 								#else
 								static __always_inline bool has_svnapot(void) { return false; }
 								static inline unsigned long pte_napot(pte_t pte)
 								{
 									return 0;
 								}
 								#endif /* CONFIG_RISCV_ISA_SVNAPOT */
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/* Yields the page frame number (PFN) of a page table entry */
 								static inline unsigned long pte_pfn(pte_t pte)
 								{
-												riscv: mm: modify pte format for Svnapot

Add one alternative to enable/disable svnapot support, enable this static
key when "svnapot" is in the "riscv,isa" field of fdt and SVNAPOT compile
option is set. It will influence the behavior of has_svnapot. All code
dependent on svnapot should make sure that has_svnapot return true firstly.

Modify PTE definition for Svnapot, and creates some functions in pgtable.h
to mark a PTE as napot and check if it is a Svnapot PTE. Until now, only
64KB napot size is supported in spec, so some macros has only 64KB version.

Signed-off-by: Qinglin Pan <panqinglin00@gmail.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20230209131647.17245-2-panqinglin00@gmail.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-02-09 13:16:45 +00:00
+									unsigned long res  = __page_val_to_pfn(pte_val(pte));
 									if (has_svnapot() && pte_napot(pte))
 										res = res & (res - 1UL);
 									return res;
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								#define pte_page(x)     pfn_to_page(pte_pfn(x))
 								/* Constructs a page table entry */
 								static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 								{
-												riscv: add memory-type errata for T-Head

Some current cpus based on T-Head cores implement memory-types
way different than described in the svpbmt spec even going
so far as using PTE bits marked as reserved.

Add the T-Head vendor-id and necessary errata code to
replace the affected instructions.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Samuel Holland <samuel@sholland.org>
Link: https://lore.kernel.org/r/20220511192921.2223629-13-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:21 +00:00
+									unsigned long prot_val = pgprot_val(prot);
 									ALT_THEAD_PMA(prot_val);
 									return __pte((pfn << _PAGE_PFN_SHIFT) | prot_val);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												riscv: Fix implicit declaration of 'page_to_section'

With CONFIG_SPARSEMEM and !CONFIG_SPARSEMEM_VMEMMAP,

arch/riscv/include/asm/pgtable.h: In function ‘mk_pte’:
include/asm-generic/memory_model.h:64:14: error: implicit declaration of function ‘page_to_section’; did you mean ‘present_section’? [-Werror=implicit-function-declaration]
  int __sec = page_to_section(__pg);   \
              ^~~~~~~~~~~~~~~

Fixed by changing mk_pte() from inline function to macro.

Cc: Logan Gunthorpe <logang@deltatee.com>
Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
[paul.walmsley@sifive.com: fixed checkpatch errors]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-23 03:23:01 +00:00
+								#define mk_pte(page, prot)       pfn_pte(page_to_pfn(page), prot)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								static inline int pte_present(pte_t pte)
 								{
-												riscv: Add pte bit to distinguish swap from invalid

Previously, invalid PTEs and swap PTEs had the same binary
representation, causing errors when attempting to unmap PROT_NONE
mappings, including implicit unmap on exit.

Typical error:

swap_info_get: Bad swap file entry 40000000007a9879
BUG: Bad page map in process a.out  pte:3d4c3cc0 pmd:3e521401

Cc: stable@vger.kernel.org
Signed-off-by: Stefan O'Rear <sorear2@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2018-12-16 18:03:36 +00:00
+									return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE));
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								static inline int pte_none(pte_t pte)
 								{
 									return (pte_val(pte) == 0);
 								}
 								static inline int pte_write(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_WRITE;
 								}
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								static inline int pte_exec(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_EXEC;
 								}
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								static inline int pte_user(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_USER;
 								}
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								static inline int pte_huge(pte_t pte)
 								{
-												riscv: mm: add _PAGE_LEAF macro

In riscv, a page table entry is leaf when any bit of read, write,
or execute bit is set. So add a macro:_PAGE_LEAF instead of
(_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC), which is frequently used
to determine if it is a leaf page. This make code easier to read,
without any functional change.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:47 +00:00
+									return pte_present(pte) && (pte_val(pte) & _PAGE_LEAF);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								static inline int pte_dirty(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_DIRTY;
 								}
 								static inline int pte_young(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_ACCESSED;
 								}
 								static inline int pte_special(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_SPECIAL;
 								}
 								/* static inline pte_t pte_rdprotect(pte_t pte) */
 								static inline pte_t pte_wrprotect(pte_t pte)
 								{
 									return __pte(pte_val(pte) & ~(_PAGE_WRITE));
 								}
 								/* static inline pte_t pte_mkread(pte_t pte) */
-												mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

The x86 Shadow stack feature includes a new type of memory called shadow
stack. This shadow stack memory has some unusual properties, which requires
some core mm changes to function properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite(). The goal is to make pte_mkwrite() take a VMA, so
that the x86 implementation of it can know whether to create regular
writable or shadow stack mappings.

But there are a couple of challenges to this. Modifying the signatures of
each arch pte_mkwrite() implementation would be error prone because some
are generated with macros and would need to be re-implemented. Also, some
pte_mkwrite() callers operate on kernel memory without a VMA.

So this can be done in a three step process. First pte_mkwrite() can be
renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite()
added that just calls pte_mkwrite_novma(). Next callers without a VMA can
be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers
can be changed to take/pass a VMA.

Start the process by renaming pte_mkwrite() to pte_mkwrite_novma() and
adding the pte_mkwrite() wrapper in linux/pgtable.h. Apply the same
pattern for pmd_mkwrite(). Since not all archs have a pmd_mkwrite_novma(),
create a new arch config HAS_HUGE_PAGE that can be used to tell if
pmd_mkwrite() should be defined. Otherwise in the !HAS_HUGE_PAGE cases the
compiler would not be able to find pmd_mkwrite_novma().

No functional change.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/lkml/CAHk-=wiZjSu7c9sFYZb3q04108stgHff2wfbokGCCgW7riz+8Q@mail.gmail.com/
Link: https://lore.kernel.org/all/20230613001108.3040476-2-rick.p.edgecombe%40intel.com

											
										
										
											2023-06-13 00:10:27 +00:00
+								static inline pte_t pte_mkwrite_novma(pte_t pte)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								{
 									return __pte(pte_val(pte) | _PAGE_WRITE);
 								}
 								/* static inline pte_t pte_mkexec(pte_t pte) */
 								static inline pte_t pte_mkdirty(pte_t pte)
 								{
 									return __pte(pte_val(pte) | _PAGE_DIRTY);
 								}
 								static inline pte_t pte_mkclean(pte_t pte)
 								{
 									return __pte(pte_val(pte) & ~(_PAGE_DIRTY));
 								}
 								static inline pte_t pte_mkyoung(pte_t pte)
 								{
 									return __pte(pte_val(pte) | _PAGE_ACCESSED);
 								}
 								static inline pte_t pte_mkold(pte_t pte)
 								{
 									return __pte(pte_val(pte) & ~(_PAGE_ACCESSED));
 								}
 								static inline pte_t pte_mkspecial(pte_t pte)
 								{
 									return __pte(pte_val(pte) | _PAGE_SPECIAL);
 								}
-												riscv: Introduce huge page support for 32/64bit kernel

This patch implements both 4MB huge page support for 32bit kernel
and 2MB/1GB huge pages support for 64bit kernel.

Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-05-26 12:50:38 +00:00
+								static inline pte_t pte_mkhuge(pte_t pte)
 								{
 									return pte;
 								}
-												riscv: Fix compilation error with FAST_GUP and rv32

By surrounding the definition of pte_leaf_size() with a ifdef napot as
it should have been.

Fixes: e0fe5ab4192c ("riscv: Fix pte_leaf_size() for NAPOT")
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Link: https://lore.kernel.org/r/20240304080247.387710-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-03-04 08:02:47 +00:00
+								#ifdef CONFIG_RISCV_ISA_SVNAPOT
-												riscv: Fix pte_leaf_size() for NAPOT

pte_leaf_size() must be reimplemented to add support for NAPOT mappings.

Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page")
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20240227205016.121901-3-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-27 20:50:16 +00:00
+								#define pte_leaf_size(pte)	(pte_napot(pte) ?				\
 													napot_cont_size(napot_cont_order(pte)) :\
 													PAGE_SIZE)
-												riscv: Fix compilation error with FAST_GUP and rv32

By surrounding the definition of pte_leaf_size() with a ifdef napot as
it should have been.

Fixes: e0fe5ab4192c ("riscv: Fix pte_leaf_size() for NAPOT")
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Link: https://lore.kernel.org/r/20240304080247.387710-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-03-04 08:02:47 +00:00
+								#endif
-												riscv: Fix pte_leaf_size() for NAPOT

pte_leaf_size() must be reimplemented to add support for NAPOT mappings.

Fixes: 82a1a1f3bfb6 ("riscv: mm: support Svnapot in hugetlb page")
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20240227205016.121901-3-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-27 20:50:16 +00:00
-												riscv: Add support pte_protnone and pmd_protnone if CONFIG_NUMA_BALANCING

These two functions are used to distinguish between PROT_NONENUMA
protections and hinting fault protections.

Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-11-19 00:38:28 +00:00
+								#ifdef CONFIG_NUMA_BALANCING
 								/*
 								 * See the comment in include/asm-generic/pgtable.h
 								 */
 								static inline int pte_protnone(pte_t pte)
 								{
 									return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) == _PAGE_PROT_NONE;
 								}
 								static inline int pmd_protnone(pmd_t pmd)
 								{
 									return pte_protnone(pmd_pte(pmd));
 								}
 								#endif
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/* Modify page protection bits */
 								static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 								{
-												riscv: add memory-type errata for T-Head

Some current cpus based on T-Head cores implement memory-types
way different than described in the svpbmt spec even going
so far as using PTE bits marked as reserved.

Add the T-Head vendor-id and necessary errata code to
replace the affected instructions.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Samuel Holland <samuel@sholland.org>
Link: https://lore.kernel.org/r/20220511192921.2223629-13-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:21 +00:00
+									unsigned long newprot_val = pgprot_val(newprot);
 									ALT_THEAD_PMA(newprot_val);
 									return __pte((pte_val(pte) & _PAGE_CHG_MASK) | newprot_val);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								#define pgd_ERROR(e) \
 									pr_err("%s:%d: bad pgd " PTE_FMT ".\n", __FILE__, __LINE__, pgd_val(e))
 								/* Commit new configuration to MMU hardware */
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+								static inline void update_mmu_cache_range(struct vm_fault *vmf,
 										struct vm_area_struct *vma, unsigned long address,
 										pte_t *ptep, unsigned int nr)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								{
 									/*
 									 * The kernel assumes that TLBs don't cache invalid entries, but
 									 * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
 									 * cache flush; it is necessary even after writing invalid entries.
 									 * Relying on flush_tlb_fix_spurious_fault would suffice, but
 									 * the extra traps reduce performance.  So, eagerly SFENCE.VMA.
 									 */
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+									while (nr--)
 										local_flush_tlb_page(address + nr * PAGE_SIZE);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+								#define update_mmu_cache(vma, addr, ptep) \
 									update_mmu_cache_range(NULL, vma, addr, ptep, 1)
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
-												riscv: support update_mmu_tlb()

Add macro definition to support update_mmu_tlb() for riscv,
this function is from commit:7df676974359 ("mm/memory.c:Update
local TLB if PTE entry exists").

update_mmu_tlb() is used when a thread notice that other cpu thread
has handled the fault and changed the PTE. For MIPS, it's worth to
do that,this cpu thread will trap in tlb fault again otherwise.

For RISCV, it's also better to flush local tlb than do nothing in
update_mmu_tlb(). There are two kinds of page fault that have
update_mmu_tlb() inside:

1.page fault which PTE is NOT none, only protection check error,
like write protection fault. If updata_mmu_tlb() is empty, after
finsh page fault this time and re-execute, cpu will find address
but protection checked error in tlb again. So this will cause
another page fault. PTE in memory is good now,so update_mmu_cache()
in handle_pte_fault() will be executed. If updata_mmu_tlb() is not
empty flush local tlb, cpu won't find this address in tlb next time,
and get entry in physical memory, so it won't cause another page
fault.

2.page fault which PTE is none or swapped.
For this case, this cpu thread won't cause another page fault,cpu
will have tlb miss when re-execute, and get entry in memory
directly. But "set pte in phycial memory and flush local tlb" is
pratice in Linux, it's better to flush local tlb if it find entry
in phycial memory has changed.

Maybe it's same for other ARCH which can't detect PTE changed and
update it in local tlb automatically.

Signed-off-by: Jinyu Tang <tjytimi@163.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20221009134503.18783-1-tjytimi@163.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-10-09 13:45:03 +00:00
+								#define __HAVE_ARCH_UPDATE_MMU_TLB
 								#define update_mmu_tlb update_mmu_cache
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 										unsigned long address, pmd_t *pmdp)
 								{
 									pte_t *ptep = (pte_t *)pmdp;
 									update_mmu_cache(vma, address, ptep);
 								}
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#define __HAVE_ARCH_PTE_SAME
 								static inline int pte_same(pte_t pte_a, pte_t pte_b)
 								{
 									return pte_val(pte_a) == pte_val(pte_b);
 								}
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								/*
 								 * Certain architectures need to do special things when PTEs within
 								 * a page table are directly modified.  Thus, the following hook is
 								 * made available.
 								 */
 								static inline void set_pte(pte_t *ptep, pte_t pteval)
 								{
-												riscv: Use WRITE_ONCE() when setting page table entries

To avoid any compiler "weirdness" when accessing page table entries which
are concurrently modified by the HW, let's use WRITE_ONCE() macro
(commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing
page tables") gives a great explanation with more details).

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20231213203001.179237-2-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-12-13 20:29:58 +00:00
+									WRITE_ONCE(*ptep, pteval);
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								}
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+								void flush_icache_pte(struct mm_struct *mm, pte_t pte);
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+								static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								{
 									if (pte_present(pteval) && pte_exec(pteval))
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+										flush_icache_pte(mm, pteval);
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
 									set_pte(ptep, pteval);
 								}
-												riscv/pgtable: define PFN_PTE_SHIFT

We want to make use of pte_next_pfn() outside of set_ptes().  Let's simply
define PFN_PTE_SHIFT, required by pte_next_pfn().

Link: https://lkml.kernel.org/r/20240129124649.189745-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Russell King (Oracle) <linux@armlinux.org.uk>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2024-01-29 12:46:39 +00:00
+								#define PFN_PTE_SHIFT		_PAGE_PFN_SHIFT
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+								static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 										pte_t *ptep, pte_t pteval, unsigned int nr)
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								{
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+									page_table_check_ptes_set(mm, ptep, pteval, nr);
 									for (;;) {
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+										__set_pte_at(mm, ptep, pteval);
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+										if (--nr == 0)
 											break;
 										ptep++;
 										pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
 									}
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								}
-												riscv: implement the new page table range API

Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio().  Change
the PG_dcache_clean flag from being per-page to per-folio.

Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-08-02 15:13:50 +00:00
+								#define set_ptes set_ptes
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								static inline void pte_clear(struct mm_struct *mm,
 									unsigned long addr, pte_t *ptep)
 								{
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+									__set_pte_at(mm, ptep, __pte(0));
-												RISC-V: Flush I$ when making a dirty page executable

The RISC-V ISA allows for instruction caches that are not coherent WRT
stores, even on a single hart.  As a result, we need to explicitly flush
the instruction cache whenever marking a dirty page as executable in
order to preserve the correct system behavior.

Local instruction caches aren't that scary (our implementations actually
flush the cache, but RISC-V is defined to allow higher-performance
implementations to exist), but RISC-V defines no way to perform an
instruction cache shootdown.  When explicitly asked to do so we can
shoot down remote instruction caches via an IPI, but this is a bit on
the slow side.

Instead of requiring an IPI to all harts whenever marking a page as
executable, we simply flush the currently running harts.  In order to
maintain correct behavior, we additionally mark every other hart as
needing a deferred instruction cache which will be taken before anything
runs on it.

Signed-off-by: Andrew Waterman <andrew@sifive.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

											
										
										
											2017-10-25 21:30:32 +00:00
+								}
-												riscv: Use accessors to page table entries instead of direct dereference

As very well explained in commit 20a004e7b017 ("arm64: mm: Use
READ_ONCE/WRITE_ONCE when accessing page tables"), an architecture whose
page table walker can modify the PTE in parallel must use
READ_ONCE()/WRITE_ONCE() macro to avoid any compiler transformation.

So apply that to riscv which is such architecture.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Acked-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20231213203001.179237-5-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-12-13 20:30:01 +00:00
+								#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS	/* defined in mm/pgtable.c */
 								extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 												 pte_t *ptep, pte_t entry, int dirty);
 								#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG	/* defined in mm/pgtable.c */
 								extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
 												     pte_t *ptep);
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 								static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 												       unsigned long address, pte_t *ptep)
 								{
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+									pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
-												mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear

Remove unused addr in page_table_check_pte_clear and
__page_table_check_pte_clear.

Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-07-13 17:26:31 +00:00
+									page_table_check_pte_clear(mm, pte);
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
 									return pte;
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								}
 								#define __HAVE_ARCH_PTEP_SET_WRPROTECT
 								static inline void ptep_set_wrprotect(struct mm_struct *mm,
 												      unsigned long address, pte_t *ptep)
 								{
 									atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
 								}
 								#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 								static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 													 unsigned long address, pte_t *ptep)
 								{
 									/*
 									 * This comment is borrowed from x86, but applies equally to RISC-V:
 									 *
 									 * Clearing the accessed bit without a TLB flush
 									 * doesn't cause data corruption. [ It could cause incorrect
 									 * page aging and the (mistaken) reclaim of hot pages, but the
 									 * chance of that should be relatively low. ]
 									 *
 									 * So as a performance optimization don't flush the TLB when
 									 * clearing the accessed bit, it will eventually be flushed by
 									 * a context switch or a VM operation anyway. [ In the rare
 									 * event of it not getting flushed for a long time the delay
 									 * shouldn't really matter because there's no real memory
 									 * pressure for swapout to react to. ]
 									 */
 									return ptep_test_and_clear_young(vma, address, ptep);
 								}
-												riscv: add RISC-V Svpbmt extension support

Svpbmt (the S should be capitalized) is the
"Supervisor-mode: page-based memory types" extension
that specifies attributes for cacheability, idempotency
and ordering.

The relevant settings are done in special bits in PTEs:

Here is the svpbmt PTE format:
| 63 | 62-61 | 60-8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
  N     MT     RSW    D   A   G   U   X   W   R   V
        ^

Of the Reserved bits [63:54] in a leaf PTE, the high bit is already
allocated (as the N bit), so bits [62:61] are used as the MT (aka
MemType) field. This field specifies one of three memory types that
are close equivalents (or equivalent in effect) to the three main x86
and ARMv8 memory types - as shown in the following table.

RISC-V
Encoding &
MemType     RISC-V Description
----------  ------------------------------------------------
00 - PMA    Normal Cacheable, No change to implied PMA memory type
01 - NC     Non-cacheable, idempotent, weakly-ordered Main Memory
10 - IO     Non-cacheable, non-idempotent, strongly-ordered I/O memory
11 - Rsvd   Reserved for future standard use

As the extension will not be present on all implementations,
implement a method to handle cpufeatures via alternatives
to not incur runtime penalties on cpu variants not supporting
specific extensions and patch relevant code parts at runtime.

Co-developed-by: Wei Fu <wefu@redhat.com>
Signed-off-by: Wei Fu <wefu@redhat.com>
Co-developed-by: Liu Shaohua <liush@allwinnertech.com>
Signed-off-by: Liu Shaohua <liush@allwinnertech.com>
Co-developed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
[moved to use the alternatives mechanism]
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
Link: https://lore.kernel.org/r/20220511192921.2223629-10-heiko@sntech.de
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-05-11 19:29:18 +00:00
+								#define pgprot_noncached pgprot_noncached
 								static inline pgprot_t pgprot_noncached(pgprot_t _prot)
 								{
 									unsigned long prot = pgprot_val(_prot);
 									prot &= ~_PAGE_MTMASK;
 									prot |= _PAGE_IO;
 									return __pgprot(prot);
 								}
 								#define pgprot_writecombine pgprot_writecombine
 								static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
 								{
 									unsigned long prot = pgprot_val(_prot);
 									prot &= ~_PAGE_MTMASK;
 									prot |= _PAGE_NOCACHE;
 									return __pgprot(prot);
 								}
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								/*
 								 * THP functions
 								 */
 								static inline pmd_t pte_pmd(pte_t pte)
 								{
 									return __pmd(pte_val(pte));
 								}
 								static inline pmd_t pmd_mkhuge(pmd_t pmd)
 								{
 									return pmd;
 								}
 								static inline pmd_t pmd_mkinvalid(pmd_t pmd)
 								{
 									return __pmd(pmd_val(pmd) & ~(_PAGE_PRESENT|_PAGE_PROT_NONE));
 								}
-												riscv: Fix missing PAGE_PFN_MASK

There are a bunch of functions that use the PFN from a page table entry
that end up with the svpbmt upper-bits because they are missing the newly
introduced PAGE_PFN_MASK which leads to wrong addresses conversions and
then crash: fix this by adding this mask.

Fixes: 100631b48ded ("riscv: Fix accessing pfn bits in PTEs for non-32bit variants")
Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Anup Patel <anup@brainfault.org>

											
										
										
											2022-07-11 03:59:51 +00:00
+								#define __pmd_to_phys(pmd)  (__page_val_to_pfn(pmd_val(pmd)) << PAGE_SHIFT)
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
 								static inline unsigned long pmd_pfn(pmd_t pmd)
 								{
 									return ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT);
 								}
-												riscv: Fix missing PAGE_PFN_MASK

There are a bunch of functions that use the PFN from a page table entry
that end up with the svpbmt upper-bits because they are missing the newly
introduced PAGE_PFN_MASK which leads to wrong addresses conversions and
then crash: fix this by adding this mask.

Fixes: 100631b48ded ("riscv: Fix accessing pfn bits in PTEs for non-32bit variants")
Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Anup Patel <anup@brainfault.org>

											
										
										
											2022-07-11 03:59:51 +00:00
+								#define __pud_to_phys(pud)  (__page_val_to_pfn(pud_val(pud)) << PAGE_SHIFT)
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
 								static inline unsigned long pud_pfn(pud_t pud)
 								{
 									return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT);
 								}
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 								{
 									return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
 								}
 								#define pmd_write pmd_write
 								static inline int pmd_write(pmd_t pmd)
 								{
 									return pte_write(pmd_pte(pmd));
 								}
-												riscv: enable HAVE_FAST_GUP if MMU

Activate the fast gup for riscv mmu platforms. Here are some
GUP_FAST_BENCHMARK performance numbers:

Before the patch:
GUP_FAST_BENCHMARK: Time: get:53203 put:5085 us

After the patch:
GUP_FAST_BENCHMARK: Time: get:17711 put:5060 us

The get time is reduced by 66.7%! IOW, 3x get speed!

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20231219175046.2496-5-jszhang@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-12-19 17:50:46 +00:00
+								#define pud_write pud_write
 								static inline int pud_write(pud_t pud)
 								{
 									return pte_write(pud_pte(pud));
 								}
-												mm/mglru: add dummy pmd_dirty()

Add dummy pmd_dirty() for architectures that don't provide it.
This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young()
for architectures not having it").

Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Suggested-by: Yu Zhao <yuzhao@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Donet Tom <donettom@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-12-27 14:12:04 +00:00
+								#define pmd_dirty pmd_dirty
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								static inline int pmd_dirty(pmd_t pmd)
 								{
 									return pte_dirty(pmd_pte(pmd));
 								}
-												mm: add dummy pmd_young() for architectures not having it

In order to avoid #ifdeffery add a dummy pmd_young() implementation as a
fallback.  This is required for the later patch "mm: introduce
arch_has_hw_nonleaf_pmd_young()".

Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-11-30 22:49:41 +00:00
+								#define pmd_young pmd_young
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								static inline int pmd_young(pmd_t pmd)
 								{
 									return pte_young(pmd_pte(pmd));
 								}
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								static inline int pmd_user(pmd_t pmd)
 								{
 									return pte_user(pmd_pte(pmd));
 								}
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								static inline pmd_t pmd_mkold(pmd_t pmd)
 								{
 									return pte_pmd(pte_mkold(pmd_pte(pmd)));
 								}
 								static inline pmd_t pmd_mkyoung(pmd_t pmd)
 								{
 									return pte_pmd(pte_mkyoung(pmd_pte(pmd)));
 								}
-												mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

The x86 Shadow stack feature includes a new type of memory called shadow
stack. This shadow stack memory has some unusual properties, which requires
some core mm changes to function properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite(). The goal is to make pte_mkwrite() take a VMA, so
that the x86 implementation of it can know whether to create regular
writable or shadow stack mappings.

But there are a couple of challenges to this. Modifying the signatures of
each arch pte_mkwrite() implementation would be error prone because some
are generated with macros and would need to be re-implemented. Also, some
pte_mkwrite() callers operate on kernel memory without a VMA.

So this can be done in a three step process. First pte_mkwrite() can be
renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite()
added that just calls pte_mkwrite_novma(). Next callers without a VMA can
be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers
can be changed to take/pass a VMA.

Start the process by renaming pte_mkwrite() to pte_mkwrite_novma() and
adding the pte_mkwrite() wrapper in linux/pgtable.h. Apply the same
pattern for pmd_mkwrite(). Since not all archs have a pmd_mkwrite_novma(),
create a new arch config HAS_HUGE_PAGE that can be used to tell if
pmd_mkwrite() should be defined. Otherwise in the !HAS_HUGE_PAGE cases the
compiler would not be able to find pmd_mkwrite_novma().

No functional change.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/lkml/CAHk-=wiZjSu7c9sFYZb3q04108stgHff2wfbokGCCgW7riz+8Q@mail.gmail.com/
Link: https://lore.kernel.org/all/20230613001108.3040476-2-rick.p.edgecombe%40intel.com

											
										
										
											2023-06-13 00:10:27 +00:00
+								static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								{
-												mm: Rename arch pte_mkwrite()'s to pte_mkwrite_novma()

The x86 Shadow stack feature includes a new type of memory called shadow
stack. This shadow stack memory has some unusual properties, which requires
some core mm changes to function properly.

One of these unusual properties is that shadow stack memory is writable,
but only in limited ways. These limits are applied via a specific PTE
bit combination. Nevertheless, the memory is writable, and core mm code
will need to apply the writable permissions in the typical paths that
call pte_mkwrite(). The goal is to make pte_mkwrite() take a VMA, so
that the x86 implementation of it can know whether to create regular
writable or shadow stack mappings.

But there are a couple of challenges to this. Modifying the signatures of
each arch pte_mkwrite() implementation would be error prone because some
are generated with macros and would need to be re-implemented. Also, some
pte_mkwrite() callers operate on kernel memory without a VMA.

So this can be done in a three step process. First pte_mkwrite() can be
renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite()
added that just calls pte_mkwrite_novma(). Next callers without a VMA can
be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers
can be changed to take/pass a VMA.

Start the process by renaming pte_mkwrite() to pte_mkwrite_novma() and
adding the pte_mkwrite() wrapper in linux/pgtable.h. Apply the same
pattern for pmd_mkwrite(). Since not all archs have a pmd_mkwrite_novma(),
create a new arch config HAS_HUGE_PAGE that can be used to tell if
pmd_mkwrite() should be defined. Otherwise in the !HAS_HUGE_PAGE cases the
compiler would not be able to find pmd_mkwrite_novma().

No functional change.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/lkml/CAHk-=wiZjSu7c9sFYZb3q04108stgHff2wfbokGCCgW7riz+8Q@mail.gmail.com/
Link: https://lore.kernel.org/all/20230613001108.3040476-2-rick.p.edgecombe%40intel.com

											
										
										
											2023-06-13 00:10:27 +00:00
+									return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)));
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								}
 								static inline pmd_t pmd_wrprotect(pmd_t pmd)
 								{
 									return pte_pmd(pte_wrprotect(pmd_pte(pmd)));
 								}
 								static inline pmd_t pmd_mkclean(pmd_t pmd)
 								{
 									return pte_pmd(pte_mkclean(pmd_pte(pmd)));
 								}
 								static inline pmd_t pmd_mkdirty(pmd_t pmd)
 								{
 									return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 								}
 								static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 												pmd_t *pmdp, pmd_t pmd)
 								{
-												mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set

Remove unused addr in __page_table_check_pmd_set and
page_table_check_pmd_set.

Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-07-13 17:26:35 +00:00
+									page_table_check_pmd_set(mm, pmdp, pmd);
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+									return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd));
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								}
-												riscv: mremap speedup - enable HAVE_MOVE_PUD and HAVE_MOVE_PMD

HAVE_MOVE_PUD enables remapping pages at the PUD level if both the source
and destination addresses are PUD-aligned.
HAVE_MOVE_PMD does similar speedup on the PMD level.

With HAVE_MOVE_PUD enabled, there is about a 143x improvement on qemu
With HAVE_MOVE_PMD enabled, there is about a 5x improvement on qemu

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-16 16:37:22 +00:00
+								static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
 												pud_t *pudp, pud_t pud)
 								{
-												mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set

Remove unused addr in __page_table_check_pud_set and
page_table_check_pud_set.

Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-07-13 17:26:36 +00:00
+									page_table_check_pud_set(mm, pudp, pud);
-												riscv: Only flush the mm icache when setting an exec pte

We used to emit a flush_icache_all() whenever a dirty executable
mapping is set in the page table but we can instead call
flush_icache_mm() which will only send IPIs to cores that currently run
this mm and add a deferred icache flush to the others.

The number of calls to sbi_remote_fence_i() (tested without IPI
support):

With a simple buildroot rootfs:
* Before: ~5k
* After :  4 (!)

Tested on HW, the boot to login is ~4.5% faster.

With an ubuntu rootfs:
* Before: ~24k
* After : ~13k

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-02-02 12:47:11 +00:00
+									return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud));
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								}
 								#ifdef CONFIG_PAGE_TABLE_CHECK
 								static inline bool pte_user_accessible_page(pte_t pte)
 								{
 									return pte_present(pte) && pte_user(pte);
 								}
 								static inline bool pmd_user_accessible_page(pmd_t pmd)
 								{
 									return pmd_leaf(pmd) && pmd_user(pmd);
-												riscv: mremap speedup - enable HAVE_MOVE_PUD and HAVE_MOVE_PMD

HAVE_MOVE_PUD enables remapping pages at the PUD level if both the source
and destination addresses are PUD-aligned.
HAVE_MOVE_PMD does similar speedup on the PMD level.

With HAVE_MOVE_PUD enabled, there is about a 143x improvement on qemu
With HAVE_MOVE_PMD enabled, there is about a 5x improvement on qemu

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-16 16:37:22 +00:00
+								}
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+								static inline bool pud_user_accessible_page(pud_t pud)
 								{
 									return pud_leaf(pud) && pud_user(pud);
 								}
 								#endif
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 								static inline int pmd_trans_huge(pmd_t pmd)
 								{
 									return pmd_leaf(pmd);
 								}
 								#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 								static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 													unsigned long address, pmd_t *pmdp,
 													pmd_t entry, int dirty)
 								{
 									return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
 								}
 								#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 								static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 													unsigned long address, pmd_t *pmdp)
 								{
 									return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
 								}
 								#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 								static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 													unsigned long address, pmd_t *pmdp)
 								{
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
+									pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0));
-												mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear

Remove unused addr in page_table_check_pmd_clear and
__page_table_check_pmd_clear.

Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-07-13 17:26:32 +00:00
+									page_table_check_pmd_clear(mm, pmd);
-												riscv/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK

As commit d283d422c6c4 ("x86: mm: add x86_64 support for page table
check"), enable ARCH_SUPPORTS_PAGE_TABLE_CHECK on riscv.

Add additional page table check stubs for page table helpers, these stubs
can be used to check the existing page table entries.

Link: https://lkml.kernel.org/r/20220507110114.4128854-7-tongtiangen@huawei.com
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2022-05-13 03:23:06 +00:00
 									return pmd;
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								}
 								#define __HAVE_ARCH_PMDP_SET_WRPROTECT
 								static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 													unsigned long address, pmd_t *pmdp)
 								{
 									ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
 								}
 								#define pmdp_establish pmdp_establish
 								static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 												unsigned long address, pmd_t *pmdp, pmd_t pmd)
 								{
-												mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set

Remove unused addr in __page_table_check_pmd_set and
page_table_check_pmd_set.

Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-07-13 17:26:35 +00:00
+									page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+									return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd)));
 								}
-												riscv: mm: Implement pmdp_collapse_flush for THP

When THP is enabled, 4K pages are collapsed into a single huge
page using the generic pmdp_collapse_flush() which will further
use flush_tlb_range() to shoot-down stale TLB entries. Unfortunately,
the generic pmdp_collapse_flush() only invalidates cached leaf PTEs
using address specific SFENCEs which results in repetitive (or
unpredictable) page faults on RISC-V implementations which cache
non-leaf PTEs.

Provide a RISC-V specific pmdp_collapse_flush() which ensures both
cached leaf and non-leaf PTEs are invalidated by using non-address
specific SFENCEs as recommended by the RISC-V privileged specification.

Fixes: e88b333142e4 ("riscv: mm: add THP support on 64-bit")
Signed-off-by: Mayuresh Chitale <mchitale@ventanamicro.com>
Link: https://lore.kernel.org/r/20230130074815.1694055-1-mchitale@ventanamicro.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-01-30 07:48:15 +00:00
 								#define pmdp_collapse_flush pmdp_collapse_flush
 								extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 												 unsigned long address, pmd_t *pmdp);
-												riscv: mm: add THP support on 64-bit

Bring Transparent HugePage support to riscv. A
transparent huge page is always represented as a pmd.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-30 08:28:50 +00:00
+								#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/*
-												riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE

Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the
offset.  This reduces the maximum swap space per file: on 32bit to 16 GiB
(was 32 GiB).

Note that this bit does not conflict with swap PMDs and could also be used
in swap PMD context later.

While at it, mask the type in __swp_entry().

Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-01-13 17:10:19 +00:00
+								 * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
 								 * are !pte_none() && !pte_present().
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								 *
 								 * Format of swap PTE:
 								 *	bit            0:	_PAGE_PRESENT (zero)
-												riscv/mm: Adjust PAGE_PROT_NONE to comply with THP semantics

This is a preparation for enabling THP migration.
As the commit b65399f6111b("arm64/mm: Change THP helpers
to comply with generic MM semantics") mentioned, pmd_present()
and pmd_trans_huge() are expected to behave in the following
manner:
-------------------------------------------------------------------------
|	PMD states	|	pmd_present	|	pmd_trans_huge	|
-------------------------------------------------------------------------
|	Mapped		|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Splitting	|	Yes		|	Yes		|
-------------------------------------------------------------------------
|	Migration/Swap	|	No		|	No		|
-------------------------------------------------------------------------

At present the PROT_NONE bit reuses the READ bit could not comply with
above semantics with two problems:
1. When splitting a PMD THP, PMD is first invalidated with
pmdp_invalidate()->pmd_mkinvalid(), which clears the PRESENT bit
and PROT_NONE bit/READ bit, if the PMD is read-only, then the PAGE_LEAF
property is also cleared, which results in pmd_present() return false.
2. When migrating, the swap entry only clear the PRESENT bit
and PROT_NONE bit/READ bit, the W/X bit may be set, so _PAGE_LEAF may be
true which results in pmd_present() return true.

Solution:
Adjust PROT_NONE bit from READ to GLOBAL bit can satisfy the above rules:
1. GLOBAL bit has no other meanings, not like the R/W/X bit, which is
also relative with _PAGE_LEAF property.
2. GLOBAL bit is at bit 5, making swap entry start from bit 6, bit 0-5
are zero, which means the PRESENT, PROT_NONE, and PAGE_LEAF are
all false, then the pmd_present() and pmd_trans_huge() return false when
in migration/swap.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-11-23 14:06:37 +00:00
+								 *	bit       1 to 3:       _PAGE_LEAF (zero)
 								 *	bit            5:	_PAGE_PROT_NONE (zero)
-												riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE

Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the
offset.  This reduces the maximum swap space per file: on 32bit to 16 GiB
(was 32 GiB).

Note that this bit does not conflict with swap PMDs and could also be used
in swap PMD context later.

While at it, mask the type in __swp_entry().

Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-01-13 17:10:19 +00:00
+								 *	bit            6:	exclusive marker
 								 *	bits      7 to 11:	swap type
-												riscv/mm: Fix the comment for swap pte format

Swap type takes bits 7-11 and swap offset should start from bit 12.

Signed-off-by: Xiao Wang <xiao.w.wang@intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20230921141652.2657054-1-xiao.w.wang@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-09-21 14:16:52 +00:00
+								 *	bits 12 to XLEN-1:	swap offset
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								 */
-												riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE

Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the
offset.  This reduces the maximum swap space per file: on 32bit to 16 GiB
(was 32 GiB).

Note that this bit does not conflict with swap PMDs and could also be used
in swap PMD context later.

While at it, mask the type in __swp_entry().

Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-01-13 17:10:19 +00:00
+								#define __SWP_TYPE_SHIFT	7
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#define __SWP_TYPE_BITS		5
 								#define __SWP_TYPE_MASK		((1UL << __SWP_TYPE_BITS) - 1)
 								#define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
 								#define MAX_SWAPFILES_CHECK()	\
 									BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
 								#define __swp_type(x)	(((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
 								#define __swp_offset(x)	((x).val >> __SWP_OFFSET_SHIFT)
 								#define __swp_entry(type, offset) ((swp_entry_t) \
-												riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE

Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the
offset.  This reduces the maximum swap space per file: on 32bit to 16 GiB
(was 32 GiB).

Note that this bit does not conflict with swap PMDs and could also be used
in swap PMD context later.

While at it, mask the type in __swp_entry().

Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-01-13 17:10:19 +00:00
+									{ (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \
 									  ((offset) << __SWP_OFFSET_SHIFT) })
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
 								#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 								#define __swp_entry_to_pte(x)	((pte_t) { (x).val })
-												riscv/mm: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE

Let's support __HAVE_ARCH_PTE_SWP_EXCLUSIVE by stealing one bit from the
offset.  This reduces the maximum swap space per file: on 32bit to 16 GiB
(was 32 GiB).

Note that this bit does not conflict with swap PMDs and could also be used
in swap PMD context later.

While at it, mask the type in __swp_entry().

Link: https://lkml.kernel.org/r/20230113171026.582290-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-01-13 17:10:19 +00:00
+								static inline int pte_swp_exclusive(pte_t pte)
 								{
 									return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 								}
 								static inline pte_t pte_swp_mkexclusive(pte_t pte)
 								{
 									return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE);
 								}
 								static inline pte_t pte_swp_clear_exclusive(pte_t pte)
 								{
 									return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE);
 								}
-												riscv/mm: Enable THP migration

Add two THP helpers required to create PMD migration swap entries,
and enable THP migration via ARCH_ENABLE_THP_MIGRATION. This can
reduce time of THP migration without splitting and guarantee the
migrated pages are still contiguous.

Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-11-23 14:06:38 +00:00
+								#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 								#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
 								#define __swp_entry_to_pmd(swp) __pmd((swp).val)
 								#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
-												riscv: Add support to dump the kernel page tables

In a similar manner to arm64, x86, powerpc, etc., it can traverse all
page tables, and dump the page table layout with the memory types and
permissions.

Add a debugfs file at /sys/kernel/debug/kernel_page_tables to export
the page table layout to userspace.

Signed-off-by: Zong Li <zong.li@sifive.com>
Tested-by: Alexandre Ghiti <alex@ghiti.fr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-03-12 02:58:35 +00:00
+								/*
 								 * In the RV64 Linux scheme, we give the user half of the virtual-address space
 								 * and give the kernel the other (upper) half.
 								 */
 								#ifdef CONFIG_64BIT
-												riscv: Allow to dynamically define VA_BITS

With 4-level page table folding at runtime, we don't know at compile time
the size of the virtual address space so we must set VA_BITS dynamically
so that sparsemem reserves the right amount of memory for struct pages.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:48 +00:00
+								#define KERN_VIRT_START	(-(BIT(VA_BITS)) + TASK_SIZE)
-												riscv: Add support to dump the kernel page tables

In a similar manner to arm64, x86, powerpc, etc., it can traverse all
page tables, and dump the page table layout with the memory types and
permissions.

Add a debugfs file at /sys/kernel/debug/kernel_page_tables to export
the page table layout to userspace.

Signed-off-by: Zong Li <zong.li@sifive.com>
Tested-by: Alexandre Ghiti <alex@ghiti.fr>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-03-12 02:58:35 +00:00
+								#else
 								#define KERN_VIRT_START	FIXADDR_START
 								#endif
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								/*
-												RISC-V: Fix FIXMAP area corruption on RV32 systems

Currently, various virtual memory areas of Linux RISC-V are organized
in increasing order of their virtual addresses is as follows:
1. User space area (This is lowest area and starts at 0x0)
2. FIXMAP area
3. VMALLOC area
4. Kernel area (This is highest area and starts at PAGE_OFFSET)

The maximum size of user space aread is represented by TASK_SIZE.

On RV32 systems, TASK_SIZE is defined as VMALLOC_START which causes the
user space area to overlap the FIXMAP area. This allows user space apps
to potentially corrupt the FIXMAP area and kernel OF APIs will crash
whenever they access corrupted FDT in the FIXMAP area.

On RV64 systems, TASK_SIZE is set to fixed 256GB and no other areas
happen to overlap so we don't see any FIXMAP area corruptions.

This patch fixes FIXMAP area corruption on RV32 systems by setting
TASK_SIZE to FIXADDR_START. We also move FIXADDR_TOP, FIXADDR_SIZE,
and FIXADDR_START defines to asm/pgtable.h so that we can avoid cyclic
header includes.

Signed-off-by: Anup Patel <anup.patel@wdc.com>
Tested-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-08-19 05:14:23 +00:00
+								 * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								 * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
-												riscv: Explicit comment about user virtual address space size

Define precisely the size of the user accessible virtual space size
for sv32/39/48 mmu types and explain why the whole virtual address
space is split into 2 equal chunks between kernel and user space.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:53 +00:00
+								 * Task size is:
-												RISC-V: mm: Update pgtable comment documentation

sv57 is supported in the kernel so pgtable.h should reflect that.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-4-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:03 +00:00
+								 * -        0x9fc00000	(~2.5GB) for RV32.
 								 * -      0x4000000000	( 256GB) for RV64 using SV39 mmu
 								 * -    0x800000000000	( 128TB) for RV64 using SV48 mmu
 								 * - 0x100000000000000	(  64PB) for RV64 using SV57 mmu
-												riscv: Explicit comment about user virtual address space size

Define precisely the size of the user accessible virtual space size
for sv32/39/48 mmu types and explain why the whole virtual address
space is split into 2 equal chunks between kernel and user space.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:53 +00:00
+								 *
 								 * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V
 								 * Instruction Set Manual Volume II: Privileged Architecture" states that
 								 * "load and store effective addresses, which are 64bits, must have bits
 								 * 63–48 all equal to bit 47, or else a page-fault exception will occur."
-												RISC-V: mm: Update pgtable comment documentation

sv57 is supported in the kernel so pgtable.h should reflect that.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20230809232218.849726-4-charlie@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2023-08-09 23:22:03 +00:00
+								 * Similarly for SV57, bits 63–57 must be equal to bit 56.
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								 */
 								#ifdef CONFIG_64BIT
-												riscv: compat: Support TASK_SIZE for compat mode

Make TASK_SIZE from const to dynamic detect TIF_32BIT flag
function. Refer to arm64 to implement DEFAULT_MAP_WINDOW_64 for
efi-stub.

Limit 32-bit compatible process in 0-2GB virtual address range
(which is enough for real scenarios), because it could avoid
address sign extend problem when 32-bit enter 64-bit and ease
software design.

The standard 32-bit TASK_SIZE is 0x9dc00000:FIXADDR_START, and
compared to a compatible 32-bit, it increases 476MB for the
application's virtual address.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Link: https://lore.kernel.org/r/20220405071314.3225832-11-guoren@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-04-05 07:13:04 +00:00
+								#define TASK_SIZE_64	(PGDIR_SIZE * PTRS_PER_PGD / 2)
 								#define TASK_SIZE_MIN	(PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
 								#ifdef CONFIG_COMPAT
 								#define TASK_SIZE_32	(_AC(0x80000000, UL) - PAGE_SIZE)
-												riscv: Replace direct thread flag check with is_compat_task()

There is some code that detects compat mode into a task by checking the
flag directly, and other code that check using the helper is_compat_task().

Since the helper already exists, use it instead of checking the flags
directly.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Link: https://lore.kernel.org/r/20240103160024.70305-4-leobras@redhat.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2024-01-03 16:00:20 +00:00
+								#define TASK_SIZE	(is_compat_task() ? \
-												riscv: compat: Support TASK_SIZE for compat mode

Make TASK_SIZE from const to dynamic detect TIF_32BIT flag
function. Refer to arm64 to implement DEFAULT_MAP_WINDOW_64 for
efi-stub.

Limit 32-bit compatible process in 0-2GB virtual address range
(which is enough for real scenarios), because it could avoid
address sign extend problem when 32-bit enter 64-bit and ease
software design.

The standard 32-bit TASK_SIZE is 0x9dc00000:FIXADDR_START, and
compared to a compatible 32-bit, it increases 476MB for the
application's virtual address.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Link: https://lore.kernel.org/r/20220405071314.3225832-11-guoren@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2022-04-05 07:13:04 +00:00
+											 TASK_SIZE_32 : TASK_SIZE_64)
 								#else
 								#define TASK_SIZE	TASK_SIZE_64
 								#endif
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#else
-												riscv: Implement sv48 support

By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that offers
128TB of virtual address space to userspace and allows up to 64TB of
physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level into
PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:51 +00:00
+								#define TASK_SIZE	FIXADDR_START
 								#define TASK_SIZE_MIN	TASK_SIZE
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#endif
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
+								#else /* CONFIG_MMU */
-												riscv: Add pgprot_writecombine/device and PAGE_SHARED defination if NOMMU

Some drivers use PAGE_SHARED, pgprot_writecombine()/pgprot_device(),
add the defination to fix build error if NOMMU.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-05-11 02:19:54 +00:00
+								#define PAGE_SHARED		__pgprot(0)
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
+								#define PAGE_KERNEL		__pgprot(0)
 								#define swapper_pg_dir		NULL
-												mm: switch the test_vmalloc module to use __vmalloc_node

No need to export the very low-level __vmalloc_node_range when the test
module can use a slightly higher level variant.

[akpm@linux-foundation.org: add missing `node' arg]
[akpm@linux-foundation.org: fix riscv nommu build]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-26-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

											
										
										
											2020-06-02 04:51:57 +00:00
+								#define TASK_SIZE		0xffffffffUL
-												riscv: fix VMALLOC_START definition

When below config items are set, compiler complained:

--------------------
CONFIG_CRASH_CORE=y
CONFIG_KEXEC_CORE=y
CONFIG_CRASH_DUMP=y
......
-----------------------

-------------------------------------------------------------------
arch/riscv/kernel/crash_core.c: In function 'arch_crash_save_vmcoreinfo':
arch/riscv/kernel/crash_core.c:11:58: warning: format '%lx' expects argument of type 'long unsigned int', but argument 2 has type 'int' [-Wformat=]
11 |         vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
   |                                                        ~~^
   |                                                          |
   |                                                          long unsigned int
   |                                                        %x
----------------------------------------------------------------------

This is because on riscv macro VMALLOC_START has different type when
CONFIG_MMU is set or unset.

arch/riscv/include/asm/pgtable.h:
--------------------------------------------------

Changing it to _AC(0, UL) in case CONFIG_MMU=n can fix the warning.

Link: https://lkml.kernel.org/r/ZW7OsX4zQRA3mO4+@MiWiFi-R3L-srv
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>	# build-tested
Cc: Eric DeVolder <eric_devolder@yahoo.com>
Cc: Ignat Korchagin <ignat@cloudflare.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

											
										
										
											2023-12-05 03:02:55 +00:00
+								#define VMALLOC_START		_AC(0, UL)
-												mm: switch the test_vmalloc module to use __vmalloc_node

No need to export the very low-level __vmalloc_node_range when the test
module can use a slightly higher level variant.

[akpm@linux-foundation.org: add missing `node' arg]
[akpm@linux-foundation.org: fix riscv nommu build]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200414131348.444715-26-hch@lst.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

											
										
										
											2020-06-02 04:51:57 +00:00
+								#define VMALLOC_END		TASK_SIZE
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
 								#endif /* !CONFIG_MMU */
-												riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at PAGE_OFFSET address therefore we could use
the linear mapping for the kernel mapping. But the relocated kernel base
address will be different from PAGE_OFFSET and since in the linear mapping,
two different virtual addresses cannot point to the same physical address,
the kernel mapping needs to lie outside the linear mapping so that we don't
have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space, BPF
is now always after the kernel and modules use the 2GB memory range right
before the kernel, so BPF and modules regions do not overlap. KASLR
implementation will simply have to move the kernel in the last 2GB range
and just take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-11 16:41:44 +00:00
+								extern char _start[];
-												RISC-V: enable XIP

Introduce XIP (eXecute In Place) support for RISC-V platforms.
It allows code to be executed directly from non-volatile storage
directly addressable by the CPU, such as QSPI NOR flash which can
be found on many RISC-V platforms. This makes way for significant
optimization of RAM footprint. The XIP kernel is not compressed
since it has to run directly from flash, so it will occupy more
space on the non-volatile storage. The physical flash address used
to link the kernel object files and for storing it has to be known
at compile time and is represented by a Kconfig option.

XIP on RISC-V will for the time being only work on MMU-enabled
kernels.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
[Alex: Rebase on top of "Move kernel mapping outside the linear mapping" ]
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: disable XIP for allyesconfig]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-13 06:35:14 +00:00
+								extern void *_dtb_early_va;
 								extern uintptr_t _dtb_early_pa;
 								#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU)
 								#define dtb_early_va	(*(void **)XIP_FIXUP(&_dtb_early_va))
 								#define dtb_early_pa	(*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa))
 								#else
 								#define dtb_early_va	_dtb_early_va
 								#define dtb_early_pa	_dtb_early_pa
 								#endif /* CONFIG_XIP_KERNEL */
-												riscv: Implement sv48 support

By adding a new 4th level of page table, give the possibility to 64bit
kernel to address 2^48 bytes of virtual address: in practice, that offers
128TB of virtual address space to userspace and allows up to 64TB of
physical memory.

If the underlying hardware does not support sv48, we will automatically
fallback to a standard 3-level page table by folding the new PUD level into
PGDIR level. In order to detect HW capabilities at runtime, we
use SATP feature that ignores writes with an unsupported mode.

Signed-off-by: Alexandre Ghiti <alexandre.ghiti@canonical.com>
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>

											
										
										
											2021-12-06 10:46:51 +00:00
+								extern u64 satp_mode;
-												RISC-V: enable XIP

Introduce XIP (eXecute In Place) support for RISC-V platforms.
It allows code to be executed directly from non-volatile storage
directly addressable by the CPU, such as QSPI NOR flash which can
be found on many RISC-V platforms. This makes way for significant
optimization of RAM footprint. The XIP kernel is not compressed
since it has to run directly from flash, so it will occupy more
space on the non-volatile storage. The physical flash address used
to link the kernel object files and for storing it has to be known
at compile time and is represented by a Kconfig option.

XIP on RISC-V will for the time being only work on MMU-enabled
kernels.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com>
[Alex: Rebase on top of "Move kernel mapping outside the linear mapping" ]
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: disable XIP for allyesconfig]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2021-04-13 06:35:14 +00:00
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
+								void paging_init(void);
-												riscv: Separate memory init from paging init

Currently, we perform some memory init functions in paging init. But,
that will be an issue for NUMA support where DT needs to be flattened
before numa initialization and memblock_present can only be called
after numa initialization.

Move memory initialization related functions to a separate function.

Signed-off-by: Atish Patra <atish.patra@wdc.com>
Reviewed-by: Greentime Hu <greentime.hu@sifive.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

											
										
										
											2020-11-19 00:38:27 +00:00
+								void misc_mem_init(void);
-												riscv: add nommu support

The kernel runs in M-mode without using page tables, and thus can't run
bare metal without help from additional firmware.

Most of the patch is just stubbing out code not needed without page
tables, but there is an interesting detail in the signals implementation:

 - The normal RISC-V syscall ABI only implements rt_sigreturn as VDSO
   entry point, but the ELF VDSO is not supported for nommu Linux.
   We instead copy the code to call the syscall onto the stack.

In addition to enabling the nommu code a new defconfig for a small
kernel image that can run in nommu mode on qemu is also provided, to run
a kernel in qemu you can use the following command line:

qemu-system-riscv64 -smp 2 -m 64 -machine virt -nographic \
	-kernel arch/riscv/boot/loader \
	-drive file=rootfs.ext2,format=raw,id=hd0 \
	-device virtio-blk-device,drive=hd0

Contains contributions from Damien Le Moal <Damien.LeMoal@wdc.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anup Patel <anup@brainfault.org>
[paul.walmsley@sifive.com: updated to apply; add CONFIG_MMU guards
 around PCI_IOBASE definition to fix build issues; fixed checkpatch
 issues; move the PCI_IO_* and VMEMMAP address space macros along
 with the others; resolve sparse warning]
Signed-off-by: Paul Walmsley <paul.walmsley@sifive.com>
											
										
										
											2019-10-28 12:10:41 +00:00
 								/*
 								 * ZERO_PAGE is a global shared page that is always zero,
 								 * used for zero-mapped memory areas, etc.
 								 */
 								extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 								#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
-												RISC-V: Paging and MMU

This patch contains code to manage the RISC-V MMU, including definitions
of the page tables and the page walking code.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>

											
										
										
											2017-07-11 01:06:09 +00:00
+								#endif /* !__ASSEMBLY__ */
 								#endif /* _ASM_RISCV_PGTABLE_H */