From 031e011d8b22346a6513c7879cbecd7105b4c11d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:26 +0100 Subject: [PATCH 001/134] arm64: mm: Move PCI I/O emulation region above the vmemmap region Move the PCI I/O region above the vmemmap region in the kernel's VA space. This will permit us to reclaim the lower part of the vmemmap region for vmalloc/vmap allocations when running a 52-bit VA capable build on a 48-bit VA capable system. Also, given that PCI_IO_START is derived from VMEMMAP_END, use that symbolic constant directly in ptdump rather than deriving it from VMEMMAP_START and VMEMMAP_SIZE, as those definitions will change in subsequent patches. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-10-ardb@google.com Signed-off-by: Catalin Marinas Acked-by: Mark Rutland --- arch/arm64/include/asm/memory.h | 4 ++-- arch/arm64/mm/ptdump.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index d82305ab420f..b04fa8a8a262 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -49,8 +49,8 @@ #define MODULES_VSIZE (SZ_2G) #define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) -#define PCI_IO_END (VMEMMAP_START - SZ_8M) -#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) +#define PCI_IO_START (VMEMMAP_END + SZ_8M) +#define PCI_IO_END (PCI_IO_START + PCI_IO_SIZE) #define FIXADDR_TOP (VMEMMAP_START - SZ_32M) #if VA_BITS > 48 diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index e305b6593c4e..46acb2a24da0 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -47,10 +47,10 @@ static struct addr_marker address_markers[] = { { VMALLOC_END, "vmalloc() end" }, { FIXADDR_TOT_START, "Fixmap start" }, { FIXADDR_TOP, "Fixmap end" }, + { VMEMMAP_START, "vmemmap start" }, + { VMEMMAP_END, "vmemmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, - { VMEMMAP_START, "vmemmap start" }, - { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" }, { -1, NULL }, }; From b730b0f2b1fcfbdaed816152cc71993fd708aa11 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:27 +0100 Subject: [PATCH 002/134] arm64: mm: Move fixmap region above vmemmap region Move the fixmap region above the vmemmap region, so that the start of the vmemmap delineates the end of the region available for vmalloc and vmap allocations and the randomized placement of the kernel and modules. In a subsequent patch, we will take advantage of this to reclaim most of the vmemmap area when running a 52-bit VA capable build with 52-bit virtual addressing disabled at runtime. Note that the existing guard region of 256 MiB covers the fixmap and PCI I/O regions as well, so we can reduce it to 8 MiB, which is what we use in other places too.
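To make the resulting layout concrete, here is a minimal user-space sketch (not kernel code) that mirrors the macros as they stand after this patch, under assumed example parameters: 48-bit VAs, 4k pages, a 64-byte struct page, and a 16 MiB PCI I/O window. All constants are illustrative stand-ins for the real kernel definitions.

#include <stdio.h>

#define VA_BITS			48
#define PAGE_SHIFT		12
#define STRUCT_PAGE_MAX_SHIFT	6		/* log2(64-byte struct page) */
#define SZ_8M			0x800000ULL
#define PCI_IO_SIZE		0x1000000ULL	/* 16 MiB */

#define PAGE_OFFSET	(-(1ULL << VA_BITS))
#define PAGE_END	(-(1ULL << (VA_BITS - 1)))
#define VMEMMAP_SHIFT	(PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE	((PAGE_END - PAGE_OFFSET) >> VMEMMAP_SHIFT)
#define VMEMMAP_START	(-(1ULL << (VA_BITS - VMEMMAP_SHIFT)))
#define VMEMMAP_END	(VMEMMAP_START + VMEMMAP_SIZE)
#define PCI_IO_START	(VMEMMAP_END + SZ_8M)	/* 8M guard above vmemmap */
#define PCI_IO_END	(PCI_IO_START + PCI_IO_SIZE)
#define FIXADDR_TOP	(-SZ_8M)		/* fixmap ends 8M below the top */

int main(void)
{
	printf("vmemmap:    %#llx - %#llx\n", VMEMMAP_START, VMEMMAP_END);
	printf("PCI I/O:    %#llx - %#llx\n", PCI_IO_START, PCI_IO_END);
	printf("fixmap top: %#llx\n", FIXADDR_TOP);
	return 0;
}

Running this prints vmemmap at 0xfffffc0000000000-0xfffffe0000000000 with the PCI I/O window just above it and the fixmap at the very top, i.e. the low-to-high order vmalloc / vmemmap / PCI I/O / fixmap that this pair of patches establishes.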
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-11-ardb@google.com Signed-off-by: Catalin Marinas Acked-by: Mark Rutland --- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/mm/fixmap.c | 3 +++ arch/arm64/mm/ptdump.c | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index b04fa8a8a262..f3be3ea74138 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -51,7 +51,7 @@ #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) #define PCI_IO_START (VMEMMAP_END + SZ_8M) #define PCI_IO_END (PCI_IO_START + PCI_IO_SIZE) -#define FIXADDR_TOP (VMEMMAP_START - SZ_32M) +#define FIXADDR_TOP (-UL(SZ_8M)) #if VA_BITS > 48 #define VA_BITS_MIN (48) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751..2aa2e3c961d7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -22,7 +22,7 @@ * and fixed mappings */ #define VMALLOC_START (MODULES_END) -#define VMALLOC_END (VMEMMAP_START - SZ_256M) +#define VMALLOC_END (VMEMMAP_START - SZ_8M) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bd..6fc17b2e1714 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -16,6 +16,9 @@ #include #include +/* ensure that the fixmap region does not grow down into the PCI I/O region */ +static_assert(FIXADDR_TOT_START > PCI_IO_END); + #define NR_BM_PTE_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) #define NR_BM_PMD_TABLES \ diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 46acb2a24da0..a929b5a321db 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -45,12 +45,12 @@ static struct addr_marker address_markers[] = { { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() area" }, { VMALLOC_END, "vmalloc() end" }, - { FIXADDR_TOT_START, "Fixmap start" }, - { FIXADDR_TOP, "Fixmap end" }, { VMEMMAP_START, "vmemmap start" }, { VMEMMAP_END, "vmemmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, + { FIXADDR_TOT_START, "Fixmap start" }, + { FIXADDR_TOP, "Fixmap end" }, { -1, NULL }, }; From 34f879fbe461dcdcaecb8bece3266dbea3a745ba Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:28 +0100 Subject: [PATCH 003/134] arm64: ptdump: Allow all region boundaries to be defined at boot time Rework the way the address_markers array is populated so that we can tolerate values that are not compile time constants generally, rather than keeping track manually of the array indexes in question, and poking new values into them manually. This will be needed for VMALLOC_END, which will cease to be a compile time constant after a subsequent patch. 
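The pattern the patch adopts, reduced to plain C: build the table as a local array whose initializers may be runtime values, then copy it into static storage that outlives the function. A minimal sketch with hypothetical marker values follows; only the shape of the code is meant to match the patch.

#include <stdint.h>
#include <string.h>

struct addr_marker {
	uint64_t start_address;
	const char *name;
};

static uint64_t runtime_end = 0xffff800000000000ULL;	/* example boot-time value */

const struct addr_marker *init_markers(void)
{
	/* initializers need not be compile-time constants here */
	struct addr_marker m[] = {
		{ 0xffff000000000000ULL, "Linear Mapping start" },	/* example */
		{ runtime_end,           "Linear Mapping end" },
		{ (uint64_t)-1,          NULL },
	};
	/* fixed-size static copy that remains valid after we return */
	static struct addr_marker markers[sizeof(m) / sizeof(m[0])];

	return memcpy(markers, m, sizeof(m));
}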
Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-12-ardb@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 54 +++++++++++++++++------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index a929b5a321db..66ccb8d6997e 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -26,34 +26,6 @@ #include -enum address_markers_idx { - PAGE_OFFSET_NR = 0, - PAGE_END_NR, -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - KASAN_START_NR, -#endif -}; - -static struct addr_marker address_markers[] = { - { PAGE_OFFSET, "Linear Mapping start" }, - { 0 /* PAGE_END */, "Linear Mapping end" }, -#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, - { KASAN_SHADOW_END, "Kasan shadow end" }, -#endif - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { VMALLOC_START, "vmalloc() area" }, - { VMALLOC_END, "vmalloc() end" }, - { VMEMMAP_START, "vmemmap start" }, - { VMEMMAP_END, "vmemmap end" }, - { PCI_IO_START, "PCI I/O start" }, - { PCI_IO_END, "PCI I/O end" }, - { FIXADDR_TOT_START, "Fixmap start" }, - { FIXADDR_TOP, "Fixmap end" }, - { -1, NULL }, -}; - #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -339,9 +311,8 @@ static void __init ptdump_initialize(void) pg_level[i].mask |= pg_level[i].bits[j].mask; } -static struct ptdump_info kernel_ptdump_info = { +static struct ptdump_info kernel_ptdump_info __ro_after_init = { .mm = &init_mm, - .markers = address_markers, .base_addr = PAGE_OFFSET, }; @@ -375,10 +346,29 @@ void ptdump_check_wx(void) static int __init ptdump_init(void) { - address_markers[PAGE_END_NR].start_address = PAGE_END; + struct addr_marker m[] = { + { PAGE_OFFSET, "Linear Mapping start" }, + { PAGE_END, "Linear Mapping end" }, #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) - address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START; + { KASAN_SHADOW_START, "Kasan shadow start" }, + { KASAN_SHADOW_END, "Kasan shadow end" }, #endif + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, + { VMALLOC_START, "vmalloc() area" }, + { VMALLOC_END, "vmalloc() end" }, + { VMEMMAP_START, "vmemmap start" }, + { VMEMMAP_END, "vmemmap end" }, + { PCI_IO_START, "PCI I/O start" }, + { PCI_IO_END, "PCI I/O end" }, + { FIXADDR_TOT_START, "Fixmap start" }, + { FIXADDR_TOP, "Fixmap end" }, + { -1, NULL }, + }; + static struct addr_marker address_markers[ARRAY_SIZE(m)] __ro_after_init; + + kernel_ptdump_info.markers = memcpy(address_markers, m, sizeof(m)); + ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); return 0; From f9cca2444187f46e0bc90dfc3a5feb7154dbf38d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:29 +0100 Subject: [PATCH 004/134] arm64: ptdump: Discover start of vmemmap region at runtime We will soon reclaim the part of the vmemmap region that covers VA space that is not addressable by the hardware. To avoid confusion, ensure that the 'vmemmap start' marker points at the start of the region that is actually being used for the struct page array, rather than the start of the region we set aside for it at build time.
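To put a number on how much of the statically sized region can sit unused: a back-of-envelope sketch, assuming 4k pages, a 64-byte struct page, and a 52-bit VA capable build running with 48-bit virtual addresses (the configuration this series is concerned with).

#include <stdio.h>

int main(void)
{
	/* the linear map begins 2^52 - 2^48 bytes above the build-time PAGE_OFFSET */
	unsigned long long skipped_bytes = (1ULL << 52) - (1ULL << 48);
	unsigned long long skipped_pages = skipped_bytes >> 12;	/* 4k pages */
	unsigned long long unused_vmemmap = skipped_pages * 64;	/* 64 B per page */

	/* prints 61440 GiB: ~60 TiB of vmemmap with no struct pages behind it */
	printf("unused leading vmemmap: %llu GiB\n", unused_vmemmap >> 30);
	return 0;
}

That leading, never-populated stretch is what the 'vmemmap start' marker would otherwise misleadingly include, and what a later patch in this series reclaims for vmalloc.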
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-13-ardb@google.com Signed-off-by: Catalin Marinas Acked-by: Mark Rutland --- arch/arm64/mm/ptdump.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 66ccb8d6997e..5f0849528ccf 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -346,6 +346,8 @@ void ptdump_check_wx(void) static int __init ptdump_init(void) { + u64 page_offset = _PAGE_OFFSET(vabits_actual); + u64 vmemmap_start = (u64)virt_to_page((void *)page_offset); struct addr_marker m[] = { { PAGE_OFFSET, "Linear Mapping start" }, { PAGE_END, "Linear Mapping end" }, @@ -357,7 +359,7 @@ static int __init ptdump_init(void) { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() area" }, { VMALLOC_END, "vmalloc() end" }, - { VMEMMAP_START, "vmemmap start" }, + { vmemmap_start, "vmemmap start" }, { VMEMMAP_END, "vmemmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, From 32697ff38287bb9f6c7ee1b04656a677b62496a6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:30 +0100 Subject: [PATCH 005/134] arm64: vmemmap: Avoid base2 order of struct page size to dimension region The placement and size of the vmemmap region in the kernel virtual address space is currently derived from the base2 order of the size of a struct page. This makes for nicely aligned constants with lots of leading 0xf and trailing 0x0 digits, but given that the actual struct pages are indexed as an ordinary array, this resulting region is severely overdimensioned when the size of a struct page is just over a power of 2. This doesn't matter today, but once we enable 52-bit virtual addressing for 4k pages configurations, the vmemmap region may take up almost half of the upper VA region with the current struct page upper bound at 64 bytes. And once we enable KMSAN or other features that push the size of a struct page over 64 bytes, we will run out of VMALLOC space entirely. So instead, let's derive the region size from the actual size of a struct page, and place the entire region 1 GB from the top of the VA space, where it still doesn't share any lower level translation table entries with the fixmap. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-14-ardb@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index f3be3ea74138..60904a6c4b42 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -30,8 +30,8 @@ * keep a constant PAGE_OFFSET and "fallback" to using the higher end * of the VMEMMAP where 52-bit support is not available in hardware.
*/ -#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT) -#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT) +#define VMEMMAP_RANGE (_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) +#define VMEMMAP_SIZE ((VMEMMAP_RANGE >> PAGE_SHIFT) * sizeof(struct page)) /* * PAGE_OFFSET - the virtual address of the start of the linear map, at the @@ -47,8 +47,8 @@ #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (_PAGE_END(VA_BITS_MIN)) #define MODULES_VSIZE (SZ_2G) -#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) -#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) +#define VMEMMAP_START (VMEMMAP_END - VMEMMAP_SIZE) +#define VMEMMAP_END (-UL(SZ_1G)) #define PCI_IO_START (VMEMMAP_END + SZ_8M) #define PCI_IO_END (PCI_IO_START + PCI_IO_SIZE) #define FIXADDR_TOP (-UL(SZ_8M)) From d432b8d57c0c41873f1b8743203776baeb5778b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:31 +0100 Subject: [PATCH 006/134] arm64: mm: Reclaim unused vmemmap region for vmalloc use The vmemmap array is statically sized based on the maximum supported size of the virtual address space, but it is located inside the upper VA region, which is statically sized based on the *minimum* supported size of the VA space. This doesn't matter much when using 64k pages, which is the only configuration that currently supports 52-bit virtual addressing. However, upcoming LPA2 support will change this picture somewhat, as in that case, the vmemmap array will take up more than 25% of the upper VA region when using 4k pages. Given that most of this space is never used when running on a system that does not support 52-bit virtual addressing, let's reclaim the unused vmemmap area in that case. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-15-ardb@google.com Signed-off-by: Catalin Marinas Acked-by: Mark Rutland --- arch/arm64/include/asm/pgtable.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2aa2e3c961d7..522c21348ae8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -18,11 +18,15 @@ * VMALLOC range. * * VMALLOC_START: beginning of the kernel vmalloc space - * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space - * and fixed mappings + * VMALLOC_END: extends to the available space below vmemmap */ #define VMALLOC_START (MODULES_END) +#if VA_BITS == VA_BITS_MIN #define VMALLOC_END (VMEMMAP_START - SZ_8M) +#else +#define VMEMMAP_UNUSED_NPAGES ((_PAGE_OFFSET(vabits_actual) - PAGE_OFFSET) >> PAGE_SHIFT) +#define VMALLOC_END (VMEMMAP_START + VMEMMAP_UNUSED_NPAGES * sizeof(struct page) - SZ_8M) +#endif #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) From 3567fa63cb5680d3e1e8375c547a0e305c8a0ff5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 13 Dec 2023 09:40:32 +0100 Subject: [PATCH 007/134] arm64: kaslr: Adjust randomization range dynamically Currently, we base the KASLR randomization range on a rough estimate of the available space in the upper VA region: the lower 1/4th has the module region and the upper 1/4th has the fixmap, vmemmap and PCI I/O ranges, and so we pick a random location in the remaining space in the middle. 
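The updated kaslr_early_init() in the diff below computes range / 2 + (((__uint128_t)range * seed) >> 64). A standalone sketch of why that expression lands in the middle half of the span; the span and seed values here are hypothetical, and unsigned __int128 is a GCC/Clang extension:

#include <stdio.h>
#include <stdint.h>

/* map a uniform 64-bit seed onto [0, range) without a modulo and its bias */
static uint64_t scale(uint64_t range, uint64_t seed)
{
	return ((unsigned __int128)range * seed) >> 64;
}

int main(void)
{
	uint64_t span  = 1ULL << 46;			/* assumed vmalloc span */
	uint64_t range = span / 2;
	uint64_t seed  = 0x123456789abcdef0ULL;		/* hypothetical seed */
	uint64_t off   = range / 2 + scale(range, seed);

	/* off lies in [span/4, 3*span/4): the middle half of the span */
	printf("offset = %#llx\n", (unsigned long long)off);
	return 0;
}

Because scale() returns a value strictly below range, the sum stays within [range/2, 3*range/2), which is exactly the "middle half, clear of the lower and upper quarters" placement the commit message describes, now measured against the real vmalloc boundaries rather than a fixed VA_BITS_MIN estimate.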
Once we enable support for 5-level paging with 4k pages, this no longer works: the vmemmap region, being dimensioned to cover a 52-bit linear region, takes up so much space in the upper VA region (the size of which is based on a 48-bit VA space for compatibility with non-LVA hardware) that the region above the vmalloc region takes up more than a quarter of the available space. So instead of a heuristic, let's derive the randomization range from the actual boundaries of the vmalloc region. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231213084024.2367360-16-ardb@google.com Signed-off-by: Catalin Marinas Acked-by: Mark Rutland --- arch/arm64/kernel/image-vars.h | 2 ++ arch/arm64/kernel/pi/kaslr_early.c | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5e4dc72ab1bd..e931ce078a00 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -36,6 +36,8 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); +PROVIDE(__pi_vabits_actual = vabits_actual); + #ifdef CONFIG_KVM /* diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 17bff6e399e4..b9e0bb4bc6a9 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -14,6 +14,7 @@ #include #include +#include /* taken from lib/string.c */ static char *__strstr(const char *s1, const char *s2) @@ -87,7 +88,7 @@ static u64 get_kaslr_seed(void *fdt) asmlinkage u64 kaslr_early_init(void *fdt) { - u64 seed; + u64 seed, range; if (is_kaslr_disabled_cmdline(fdt)) return 0; @@ -102,9 +103,9 @@ asmlinkage u64 kaslr_early_init(void *fdt) /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the - * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of - * the lower and upper quarters to avoid colliding with other - * allocations. + * 'middle' half of the VMALLOC area, and stay clear of the lower and + * upper quarters to avoid colliding with other allocations. */ - return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0)); + range = (VMALLOC_END - KIMAGE_VADDR) / 2; + return range / 2 + (((__uint128_t)range * seed) >> 64); } From 3bc9d71775eef9f2642cc6e85ee7593cae2bdbbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:01 +0100 Subject: [PATCH 008/134] perf: alibaba_uncore_drw: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
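The same mechanical transformation repeats across the remaining conversion patches in this series, so one sketch can stand for all of them. The "foo" driver below is hypothetical, but struct platform_driver's .remove_new member is the real void-returning hook these patches switch to:

#include <linux/module.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	return 0;	/* acquire resources here */
}

/*
 * Previously this was "static int foo_remove(...)" returning 0, wired up
 * via .remove. The int return suggested error handling that the driver
 * core never actually performed; the void variant makes that explicit.
 */
static void foo_remove(struct platform_device *pdev)
{
	/* release resources; there is nothing useful to return */
}

static struct platform_driver foo_driver = {
	.driver		= { .name = "foo" },
	.probe		= foo_probe,
	.remove_new	= foo_remove,
};
module_platform_driver(foo_driver);

MODULE_LICENSE("GPL");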
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/e6dd47f791ddcc4cc6f7a80efcede245528220e6.1702648124.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index 19d459a36be5..a9277dcf90ce 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -729,7 +729,7 @@ static int ali_drw_pmu_probe(struct platform_device *pdev) return ret; } -static int ali_drw_pmu_remove(struct platform_device *pdev) +static void ali_drw_pmu_remove(struct platform_device *pdev) { struct ali_drw_pmu *drw_pmu = platform_get_drvdata(pdev); @@ -739,8 +739,6 @@ static int ali_drw_pmu_remove(struct platform_device *pdev) ali_drw_pmu_uninit_irq(drw_pmu); perf_pmu_unregister(&drw_pmu->pmu); - - return 0; } static int ali_drw_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) @@ -795,7 +793,7 @@ static struct platform_driver ali_drw_pmu_driver = { .acpi_match_table = ali_drw_acpi_match, }, .probe = ali_drw_pmu_probe, - .remove = ali_drw_pmu_remove, + .remove_new = ali_drw_pmu_remove, }; static int __init ali_drw_pmu_init(void) From 94843f269abd4a9821848380061d05f0600b9d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:02 +0100 Subject: [PATCH 009/134] perf: amlogic: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/33dbadf246eb323edd9e09ac744111216c167a55.1702648124.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/amlogic/meson_g12_ddr_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c index 15d52ab3276a..99cc791892bc 100644 --- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c +++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c @@ -355,11 +355,9 @@ static int g12_ddr_pmu_probe(struct platform_device *pdev) return meson_ddr_pmu_create(pdev); } -static int g12_ddr_pmu_remove(struct platform_device *pdev) +static void g12_ddr_pmu_remove(struct platform_device *pdev) { meson_ddr_pmu_remove(pdev); - - return 0; } static const struct of_device_id meson_ddr_pmu_dt_match[] = { @@ -381,7 +379,7 @@ MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match); static struct platform_driver g12_ddr_pmu_driver = { .probe = g12_ddr_pmu_probe, - .remove = g12_ddr_pmu_remove, + .remove_new = g12_ddr_pmu_remove, .driver = { .name = "meson-g12-ddr-pmu", From 4df3bddf8707ae74bc5c8eb54e44a588a01e02e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:03 +0100 Subject: [PATCH 010/134] perf: arm-cci: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/70b581d58cfffdccb9fb3ed17bf3220c00f8033f.1702648124.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm-cci.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 61de861eaf91..6be03f81ae5d 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1697,16 +1697,14 @@ error_pmu_init: return ret; } -static int cci_pmu_remove(struct platform_device *pdev) +static void cci_pmu_remove(struct platform_device *pdev) { if (!g_cci_pmu) - return 0; + return; cpuhp_remove_state(CPUHP_AP_PERF_ARM_CCI_ONLINE); perf_pmu_unregister(&g_cci_pmu->pmu); g_cci_pmu = NULL; - - return 0; } static struct platform_driver cci_pmu_driver = { @@ -1716,7 +1714,7 @@ static struct platform_driver cci_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cci_pmu_probe, - .remove = cci_pmu_remove, + .remove_new = cci_pmu_remove, }; module_platform_driver(cci_pmu_driver); From 0767f1a4853239bd5688d93e2e0d29be9d58e2d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:04 +0100 Subject: [PATCH 011/134] perf: arm-ccn: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1cae5f0c4693333c91d28a09388bdb8bfcc25d0b.1702648124.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm-ccn.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 728d13d8e98a..641471bd5eff 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1515,13 +1515,11 @@ static int arm_ccn_probe(struct platform_device *pdev) return arm_ccn_pmu_init(ccn); } -static int arm_ccn_remove(struct platform_device *pdev) +static void arm_ccn_remove(struct platform_device *pdev) { struct arm_ccn *ccn = platform_get_drvdata(pdev); arm_ccn_pmu_cleanup(ccn); - - return 0; } static const struct of_device_id arm_ccn_match[] = { @@ -1539,7 +1537,7 @@ static struct platform_driver arm_ccn_driver = { .suppress_bind_attrs = true, }, .probe = arm_ccn_probe, - .remove = arm_ccn_remove, + .remove_new = arm_ccn_remove, }; static int __init arm_ccn_init(void) From 3909cb3b5f8dee759622697ec1d3540f6d35c002 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:05 +0100 Subject: [PATCH 012/134] perf: arm-cmn: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. 
However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/8698ca612e17292f8a8bbb2d1c0f6be4b2053da7.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index c584165b13ba..ac7a953fe902 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2504,7 +2504,7 @@ static int arm_cmn_probe(struct platform_device *pdev) return err; } -static int arm_cmn_remove(struct platform_device *pdev) +static void arm_cmn_remove(struct platform_device *pdev) { struct arm_cmn *cmn = platform_get_drvdata(pdev); @@ -2513,7 +2513,6 @@ static int arm_cmn_remove(struct platform_device *pdev) perf_pmu_unregister(&cmn->pmu); cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node); debugfs_remove(cmn->debug); - return 0; } #ifdef CONFIG_OF @@ -2544,7 +2543,7 @@ static struct platform_driver arm_cmn_driver = { .acpi_match_table = ACPI_PTR(arm_cmn_acpi_match), }, .probe = arm_cmn_probe, - .remove = arm_cmn_remove, + .remove_new = arm_cmn_remove, }; static int __init arm_cmn_init(void) From 79dc1570b322dbbbbfd04037113bcf3f83385025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:06 +0100 Subject: [PATCH 013/134] perf: arm_cspmu: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/9ff5a467569dd51b2fc44e11594ad5db7ea15f57.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 50b89b989ce7..3513bf50fefa 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1252,14 +1252,12 @@ static int arm_cspmu_device_probe(struct platform_device *pdev) return ret; } -static int arm_cspmu_device_remove(struct platform_device *pdev) +static void arm_cspmu_device_remove(struct platform_device *pdev) { struct arm_cspmu *cspmu = platform_get_drvdata(pdev); perf_pmu_unregister(&cspmu->pmu); cpuhp_state_remove_instance(arm_cspmu_cpuhp_state, &cspmu->cpuhp_node); - - return 0; } static const struct platform_device_id arm_cspmu_id[] = { @@ -1274,7 +1272,7 @@ static struct platform_driver arm_cspmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_cspmu_device_probe, - .remove = arm_cspmu_device_remove, + .remove_new = arm_cspmu_device_remove, .id_table = arm_cspmu_id, }; From ca1e01c8d34a8d5290b7b32f88d0e04207b1d4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:07 +0100 Subject: [PATCH 014/134] perf: arm_dmc620: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20cc24ede88f5e000991dfe6f4cf1222b819e337.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 30cea6859574..8a81be2dd5ec 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -724,7 +724,7 @@ out_teardown_dev: return ret; } -static int dmc620_pmu_device_remove(struct platform_device *pdev) +static void dmc620_pmu_device_remove(struct platform_device *pdev) { struct dmc620_pmu *dmc620_pmu = platform_get_drvdata(pdev); @@ -732,8 +732,6 @@ static int dmc620_pmu_device_remove(struct platform_device *pdev) /* perf will synchronise RCU before devres can free dmc620_pmu */ perf_pmu_unregister(&dmc620_pmu->pmu); - - return 0; } static const struct acpi_device_id dmc620_acpi_match[] = { @@ -748,7 +746,7 @@ static struct platform_driver dmc620_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dmc620_pmu_device_probe, - .remove = dmc620_pmu_device_remove, + .remove_new = dmc620_pmu_device_remove, }; static int __init dmc620_pmu_init(void) From 02d77ac1ac7ebcad5a92dd392166636194faec60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:08 +0100 Subject: [PATCH 015/134] perf: arm_dsu: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1eda5e216afcb0e26a50e9be112d4514ffd0844a.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 7ec4498e312f..bae3ca37f846 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -774,14 +774,12 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) return rc; } -static int dsu_pmu_device_remove(struct platform_device *pdev) +static void dsu_pmu_device_remove(struct platform_device *pdev) { struct dsu_pmu *dsu_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&dsu_pmu->pmu); cpuhp_state_remove_instance(dsu_pmu_cpuhp_state, &dsu_pmu->cpuhp_node); - - return 0; } static const struct of_device_id dsu_pmu_of_match[] = { @@ -806,7 +804,7 @@ static struct platform_driver dsu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dsu_pmu_device_probe, - .remove = dsu_pmu_device_remove, + .remove_new = dsu_pmu_device_remove, }; static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) From d67c3a61f063ebb582ed807aaf9bf85538d05db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:09 +0100 Subject: [PATCH 016/134] perf: arm_smmuv3: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/23bfd1a73ce819ffce6137c237608684a3cdfda6.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 6303b82566f9..af3ab3eb944e 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -965,14 +965,12 @@ out_unregister: return err; } -static int smmu_pmu_remove(struct platform_device *pdev) +static void smmu_pmu_remove(struct platform_device *pdev) { struct smmu_pmu *smmu_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&smmu_pmu->pmu); cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &smmu_pmu->node); - - return 0; } static void smmu_pmu_shutdown(struct platform_device *pdev) @@ -997,7 +995,7 @@ static struct platform_driver smmu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = smmu_pmu_probe, - .remove = smmu_pmu_remove, + .remove_new = smmu_pmu_remove, .shutdown = smmu_pmu_shutdown, }; From e63b3aef186b254f2a58263429a28caf65c5ad8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:10 +0100 Subject: [PATCH 017/134] perf: arm_spe: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/abfedc224eca7f4960b7ddfb6daedd47a3699ca5.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index b622d75d8c9e..35f0de03416f 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1263,14 +1263,13 @@ out_free_handle: return ret; } -static int arm_spe_pmu_device_remove(struct platform_device *pdev) +static void arm_spe_pmu_device_remove(struct platform_device *pdev) { struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev); arm_spe_pmu_perf_destroy(spe_pmu); arm_spe_pmu_dev_teardown(spe_pmu); free_percpu(spe_pmu->handle); - return 0; } static struct platform_driver arm_spe_pmu_driver = { @@ -1281,7 +1280,7 @@ static struct platform_driver arm_spe_pmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_spe_pmu_device_probe, - .remove = arm_spe_pmu_device_remove, + .remove_new = arm_spe_pmu_device_remove, }; static int __init arm_spe_pmu_init(void) From 1bb639382d3fe5498afdfcd64ba350b2075a2ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:11 +0100 Subject: [PATCH 018/134] perf: fsl_imx8_ddr: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/c5b76bf352385d8ef6211ee8c43352c74eee064d.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 7dbfaee372c7..4e8fa5a48fcf 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -826,7 +826,7 @@ cpuhp_state_err: return ret; } -static int ddr_perf_remove(struct platform_device *pdev) +static void ddr_perf_remove(struct platform_device *pdev) { struct ddr_pmu *pmu = platform_get_drvdata(pdev); @@ -836,7 +836,6 @@ static int ddr_perf_remove(struct platform_device *pdev) perf_pmu_unregister(&pmu->pmu); ida_free(&ddr_ida, pmu->id); - return 0; } static struct platform_driver imx_ddr_pmu_driver = { @@ -846,7 +845,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove = ddr_perf_remove, + .remove_new = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); From 78da2a93b55b647e0770c6394cc938f6918db124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:12 +0100 Subject: [PATCH 019/134] perf: fsl_imx9_ddr: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/bd12035ca467d7f4cd5edcfd6febda56600caacd.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/fsl_imx9_ddr_perf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 9685645bfe04..72c2d3074cde 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -679,7 +679,7 @@ format_string_err: return ret; } -static int ddr_perf_remove(struct platform_device *pdev) +static void ddr_perf_remove(struct platform_device *pdev) { struct ddr_pmu *pmu = platform_get_drvdata(pdev); @@ -689,8 +689,6 @@ static int ddr_perf_remove(struct platform_device *pdev) perf_pmu_unregister(&pmu->pmu); ida_free(&ddr_ida, pmu->id); - - return 0; } static struct platform_driver imx_ddr_pmu_driver = { @@ -700,7 +698,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove = ddr_perf_remove, + .remove_new = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); From e07486a832bba776f298082bfd964951e9357cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:13 +0100 Subject: [PATCH 020/134] perf: hisilicon: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert these drivers from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/33a8be0641b9447469fb7f6af0a10fb65efa97a3.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 5 ++--- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 5 ++--- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 5 ++--- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 5 ++--- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 5 ++--- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 5 ++--- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c index 40f1bc9f9b91..0e923f94fa5b 100644 --- a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c @@ -341,7 +341,7 @@ static int hisi_cpa_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_cpa_pmu_remove(struct platform_device *pdev) +static void hisi_cpa_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *cpa_pmu = platform_get_drvdata(pdev); @@ -349,7 +349,6 @@ static int hisi_cpa_pmu_remove(struct platform_device *pdev) cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, &cpa_pmu->node); hisi_cpa_pmu_enable_pm(cpa_pmu); - return 0; } static struct platform_driver hisi_cpa_pmu_driver = { @@ -359,7 +358,7 @@ static struct platform_driver hisi_cpa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_cpa_pmu_probe, - .remove = hisi_cpa_pmu_remove, + .remove_new = hisi_cpa_pmu_remove, }; static int __init hisi_cpa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index ffb039d05d07..b804e3738113 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -531,14 +531,13 @@ static int hisi_ddrc_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_ddrc_pmu_remove(struct platform_device *pdev) +static void hisi_ddrc_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *ddrc_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&ddrc_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, &ddrc_pmu->node); - return 0; } static struct platform_driver hisi_ddrc_pmu_driver = { @@ -548,7 +547,7 @@ static struct platform_driver hisi_ddrc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_ddrc_pmu_probe, - .remove = hisi_ddrc_pmu_remove, + .remove_new = hisi_ddrc_pmu_remove, }; static int __init hisi_ddrc_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 15caf99e1eef..21e69b1cdd4d 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -534,14 +534,13 @@ static int hisi_hha_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_hha_pmu_remove(struct platform_device *pdev) +static void hisi_hha_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *hha_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&hha_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, &hha_pmu->node); - return 0; } static struct platform_driver hisi_hha_pmu_driver = { @@ -551,7 +550,7 @@ static struct platform_driver hisi_hha_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_hha_pmu_probe, - .remove = hisi_hha_pmu_remove, + .remove_new = hisi_hha_pmu_remove, }; static int __init 
hisi_hha_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 794dbcd19b7a..51ba76871097 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -568,14 +568,13 @@ static int hisi_l3c_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_l3c_pmu_remove(struct platform_device *pdev) +static void hisi_l3c_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *l3c_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&l3c_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, &l3c_pmu->node); - return 0; } static struct platform_driver hisi_l3c_pmu_driver = { @@ -585,7 +584,7 @@ static struct platform_driver hisi_l3c_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_l3c_pmu_probe, - .remove = hisi_l3c_pmu_remove, + .remove_new = hisi_l3c_pmu_remove, }; static int __init hisi_l3c_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 797cf201996a..3cdb35c741f9 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -514,14 +514,13 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_pa_pmu_remove(struct platform_device *pdev) +static void hisi_pa_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *pa_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&pa_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, &pa_pmu->node); - return 0; } static const struct acpi_device_id hisi_pa_pmu_acpi_match[] = { @@ -539,7 +538,7 @@ static struct platform_driver hisi_pa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_pa_pmu_probe, - .remove = hisi_pa_pmu_remove, + .remove_new = hisi_pa_pmu_remove, }; static int __init hisi_pa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index e706ca567676..765bbd61db26 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -460,14 +460,13 @@ static int hisi_sllc_pmu_probe(struct platform_device *pdev) return ret; } -static int hisi_sllc_pmu_remove(struct platform_device *pdev) +static void hisi_sllc_pmu_remove(struct platform_device *pdev) { struct hisi_pmu *sllc_pmu = platform_get_drvdata(pdev); perf_pmu_unregister(&sllc_pmu->pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, &sllc_pmu->node); - return 0; } static struct platform_driver hisi_sllc_pmu_driver = { @@ -477,7 +476,7 @@ static struct platform_driver hisi_sllc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_sllc_pmu_probe, - .remove = hisi_sllc_pmu_remove, + .remove_new = hisi_sllc_pmu_remove, }; static int __init hisi_sllc_pmu_module_init(void) From c802bd9e354f321b40146bea45c62822f364fd05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:14 +0100 Subject: [PATCH 021/134] perf: marvell_cn10k_ddr: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. 
However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/79f48409f663f0184f03d34c6a86359ea3aa1291.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/marvell_cn10k_ddr_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c index 524ba82bfce2..e2abca188dbe 100644 --- a/drivers/perf/marvell_cn10k_ddr_pmu.c +++ b/drivers/perf/marvell_cn10k_ddr_pmu.c @@ -697,7 +697,7 @@ error: return ret; } -static int cn10k_ddr_perf_remove(struct platform_device *pdev) +static void cn10k_ddr_perf_remove(struct platform_device *pdev) { struct cn10k_ddr_pmu *ddr_pmu = platform_get_drvdata(pdev); @@ -706,7 +706,6 @@ static int cn10k_ddr_perf_remove(struct platform_device *pdev) &ddr_pmu->node); perf_pmu_unregister(&ddr_pmu->pmu); - return 0; } #ifdef CONFIG_OF @@ -733,7 +732,7 @@ static struct platform_driver cn10k_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cn10k_ddr_perf_probe, - .remove = cn10k_ddr_perf_remove, + .remove_new = cn10k_ddr_perf_remove, }; static int __init cn10k_ddr_pmu_init(void) From 86e8963f9dbac37a7427916da3c6e91f790e88dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:15 +0100 Subject: [PATCH 022/134] perf: marvell_cn10k_tad: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/055656e474208b0fb583e249530fa211fa3be57c.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/marvell_cn10k_tad_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c index fec8e82edb95..9e635f355470 100644 --- a/drivers/perf/marvell_cn10k_tad_pmu.c +++ b/drivers/perf/marvell_cn10k_tad_pmu.c @@ -351,15 +351,13 @@ static int tad_pmu_probe(struct platform_device *pdev) return ret; } -static int tad_pmu_remove(struct platform_device *pdev) +static void tad_pmu_remove(struct platform_device *pdev) { struct tad_pmu *pmu = platform_get_drvdata(pdev); cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state, &pmu->node); perf_pmu_unregister(&pmu->pmu); - - return 0; } #ifdef CONFIG_OF @@ -385,7 +383,7 @@ static struct platform_driver tad_pmu_driver = { .suppress_bind_attrs = true, }, .probe = tad_pmu_probe, - .remove = tad_pmu_remove, + .remove_new = tad_pmu_remove, }; static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) From 8a2e438e710c674765cc52a59c5bf1d352f21bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:16 +0100 Subject: [PATCH 023/134] perf: qcom_l2: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/a2587688c54834482d68fe2a44f415a649ad6477.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 3f9a98c17a89..148df5ae8ef8 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -965,7 +965,7 @@ out_unregister: return err; } -static int l2_cache_pmu_remove(struct platform_device *pdev) +static void l2_cache_pmu_remove(struct platform_device *pdev) { struct l2cache_pmu *l2cache_pmu = to_l2cache_pmu(platform_get_drvdata(pdev)); @@ -973,7 +973,6 @@ static int l2_cache_pmu_remove(struct platform_device *pdev) perf_pmu_unregister(&l2cache_pmu->pmu); cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, &l2cache_pmu->node); - return 0; } static struct platform_driver l2_cache_pmu_driver = { @@ -983,7 +982,7 @@ static struct platform_driver l2_cache_pmu_driver = { .suppress_bind_attrs = true, }, .probe = l2_cache_pmu_probe, - .remove = l2_cache_pmu_remove, + .remove_new = l2_cache_pmu_remove, }; static int __init register_l2_cache_pmu_driver(void) From d4c5cef73b257d3377cd82dc0142fc374d32d38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:17 +0100 Subject: [PATCH 024/134] perf: thunderx2: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/7be677dfa13d3a7eab6eef0d808ba8a9855d14ae.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index 1edb9c03704f..e16d10c763de 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -993,7 +993,7 @@ static int tx2_uncore_probe(struct platform_device *pdev) return 0; } -static int tx2_uncore_remove(struct platform_device *pdev) +static void tx2_uncore_remove(struct platform_device *pdev) { struct tx2_uncore_pmu *tx2_pmu, *temp; struct device *dev = &pdev->dev; @@ -1009,7 +1009,6 @@ static int tx2_uncore_remove(struct platform_device *pdev) } } } - return 0; } static struct platform_driver tx2_uncore_driver = { @@ -1019,7 +1018,7 @@ static struct platform_driver tx2_uncore_driver = { .suppress_bind_attrs = true, }, .probe = tx2_uncore_probe, - .remove = tx2_uncore_remove, + .remove_new = tx2_uncore_remove, }; static int __init tx2_uncore_driver_init(void) From b07ebe8f14282bd67074d1be05bbca3f12a3c110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 15 Dec 2023 14:59:18 +0100 Subject: [PATCH 025/134] perf: xgene: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/75dda01b2ad6e17f726830094bd38cb8faab5cbe.1702648125.git.u.kleine-koenig@pengutronix.de Signed-off-by: Will Deacon --- drivers/perf/xgene_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index 7ce344248dda..0d49343d704b 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1937,7 +1937,7 @@ xgene_pmu_dev_cleanup(struct xgene_pmu *xgene_pmu, struct list_head *pmus) } } -static int xgene_pmu_remove(struct platform_device *pdev) +static void xgene_pmu_remove(struct platform_device *pdev) { struct xgene_pmu *xgene_pmu = dev_get_drvdata(&pdev->dev); @@ -1947,13 +1947,11 @@ static int xgene_pmu_remove(struct platform_device *pdev) xgene_pmu_dev_cleanup(xgene_pmu, &xgene_pmu->mcpmus); cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, &xgene_pmu->node); - - return 0; } static struct platform_driver xgene_pmu_driver = { .probe = xgene_pmu_probe, - .remove = xgene_pmu_remove, + .remove_new = xgene_pmu_remove, .driver = { .name = "xgene-pmu", .of_match_table = xgene_pmu_of_match, From a1083ee717e9bde012268782e084d343314490a4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 13 Dec 2023 16:24:07 +0000 Subject: [PATCH 026/134] perf/arm-cmn: Improve debugfs pretty-printing for large configs The debugfs pretty-printer was written for the CMN-600 assumptions of a maximum 8x8 mesh, but CMN-700 now allows coordinates and ID values up to 12 and 128 respectively, which can overflow the format strings, mess up the alignment of the table and hurt overall readability. This table does prove useful for double-checking that the driver is picking up the topology of new systems correctly and for verifying user expectations, so tweak the formatting to stay nice and readable with wider values. 
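As a worked illustration of the dynamic widths used below (a standalone sketch with plain printf() standing in for seq_printf(), and example IDs): the %*c/%-*d pair shifts padding between the two conversions, so every cell stays exactly eight characters wide whether the ID has one, two or three digits.

  #include <stdio.h>

  int main(void)
  {
          int ids[] = { 7, 42, 127 };

          for (int i = 0; i < 3; i++) {
                  int logid = ids[i];
                  int pad = logid < 10;

                  printf("|%*c#%-*d |\n", pad + 1, ' ', 3 - pad, logid);
          }
          /*
           * output:
           *   |  #7  |
           *   | #42  |
           *   | #127 |
           */
          return 0;
  }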
Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/1d1517eadd1bac5992fab679c9dc531b381944da.1702484646.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index ac7a953fe902..564da632b8d1 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -493,6 +493,7 @@ static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) for (dn = cmn->dns; dn->type; dn++) { struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id); + int pad = dn->logid < 10; if (dn->type == CMN_TYPE_XP) continue; @@ -503,7 +504,7 @@ static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d) if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d) continue; - seq_printf(s, " #%-2d |", dn->logid); + seq_printf(s, " %*c#%-*d |", pad + 1, ' ', 3 - pad, dn->logid); return; } seq_puts(s, " |"); @@ -516,7 +517,7 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) seq_puts(s, " X"); for (x = 0; x < cmn->mesh_x; x++) - seq_printf(s, " %d ", x); + seq_printf(s, " %-2d ", x); seq_puts(s, "\nY P D+"); y = cmn->mesh_y; while (y--) { @@ -526,13 +527,13 @@ static int arm_cmn_map_show(struct seq_file *s, void *data) for (x = 0; x < cmn->mesh_x; x++) seq_puts(s, "--------+"); - seq_printf(s, "\n%d |", y); + seq_printf(s, "\n%-2d |", y); for (x = 0; x < cmn->mesh_x; x++) { struct arm_cmn_node *xp = cmn->xps + xp_base + x; for (p = 0; p < CMN_MAX_PORTS; p++) port[p][x] = arm_cmn_device_connect_info(cmn, xp, p); - seq_printf(s, " XP #%-2d |", xp_base + x); + seq_printf(s, " XP #%-3d|", xp_base + x); } seq_puts(s, "\n |"); From f82811e22b480a203a438d8e1f29af9c93ccbb0c Mon Sep 17 00:00:00 2001 From: Jamie Cunliffe Date: Fri, 20 Oct 2023 16:50:55 +0100 Subject: [PATCH 027/134] rust: Refactor the build target to allow the use of builtin targets Eventually we want all architectures to be using the target as defined by rustc. However currently some architectures can't do that and are using the target.json specification. This puts in place the foundation to allow the use of the builtin target definition or a target.json specification. 
Signed-off-by: Jamie Cunliffe Acked-by: Masahiro Yamada Tested-by: Alice Ryhl Link: https://lore.kernel.org/r/20231020155056.3495121-2-Jamie.Cunliffe@arm.com [catalin.marinas@arm.com: squashed loongarch ifneq fix from WANG Rui] Signed-off-by: Catalin Marinas --- Makefile | 1 - arch/loongarch/Makefile | 1 + arch/x86/Makefile | 1 + rust/Makefile | 5 ++++- scripts/Makefile | 4 +++- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a171eafce2a3..c951c14fd4e7 100644 --- a/Makefile +++ b/Makefile @@ -561,7 +561,6 @@ KBUILD_CFLAGS += -fno-strict-aliasing KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_RUSTFLAGS := $(rust_common_flags) \ - --target=$(objtree)/scripts/target.json \ -Cpanic=abort -Cembed-bitcode=n -Clto=n \ -Cforce-unwind-tables=n -Ccodegen-units=1 \ -Csymbol-mangling-version=v0 \ diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 983aa2b1629a..fa4fb09909ae 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -82,6 +82,7 @@ KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs endif +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS_MODULE += -Crelocation-model=pic ifeq ($(CONFIG_RELOCATABLE),y) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2264db14a25d..18cf8f0cf7cd 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -68,6 +68,7 @@ export BITS # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 ifeq ($(CONFIG_X86_KERNEL_IBT),y) diff --git a/rust/Makefile b/rust/Makefile index 9d2a16cc91cb..88f88a26e503 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -434,8 +434,11 @@ $(obj)/core.o: private skip_clippy = 1 $(obj)/core.o: private skip_flags = -Dunreachable_pub $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) -$(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs scripts/target.json FORCE +$(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE $(call if_changed_dep,rustc_library) +ifneq ($(or $(CONFIG_X86_64),$(CONFIG_LOONGARCH)),) +$(obj)/core.o: scripts/target.json +endif $(obj)/compiler_builtins.o: private rustc_objcopy = -w -W '__*' $(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE diff --git a/scripts/Makefile b/scripts/Makefile index 576cf64be667..6673cbb6194f 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -11,12 +11,14 @@ hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_builder hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_gen -always-$(CONFIG_RUST) += target.json +ifneq ($(or $(CONFIG_X86_64),$(CONFIG_LOONGARCH)),) +always-$(CONFIG_RUST) += target.json filechk_rust_target = $< < include/config/auto.conf $(obj)/target.json: scripts/generate_rust_target include/config/auto.conf FORCE $(call filechk,rust_target) +endif hostprogs += generate_rust_target generate_rust_target-rust := y From 724a75ac9542fe1f8aaa587da4d3863d8ea292fc Mon Sep 17 00:00:00 2001 From: Jamie Cunliffe Date: Fri, 20 Oct 2023 16:50:56 +0100 Subject: [PATCH 028/134] arm64: rust: Enable Rust support for AArch64 This 
commit provides the build flags for Rust for AArch64. The core Rust support already in the kernel does the rest. This enables the PAC ret and BTI options in the Rust build flags to match the options that are used when building C. The Rust samples have been tested with this commit. Signed-off-by: Jamie Cunliffe Acked-by: Will Deacon Tested-by: Dirk Behme Tested-by: Boqun Feng Acked-by: Miguel Ojeda Acked-by: Catalin Marinas Tested-by: Alice Ryhl Tested-by: Fabien Parent Link: https://lore.kernel.org/r/20231020155056.3495121-3-Jamie.Cunliffe@arm.com Signed-off-by: Catalin Marinas --- Documentation/rust/arch-support.rst | 1 + arch/arm64/Kconfig | 1 + arch/arm64/Makefile | 4 ++++ rust/Makefile | 1 + scripts/generate_rust_target.rs | 4 +++- 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst index 73203ba1e901..5c4fa9f5d1cd 100644 --- a/Documentation/rust/arch-support.rst +++ b/Documentation/rust/arch-support.rst @@ -15,6 +15,7 @@ support corresponds to ``S`` values in the ``MAINTAINERS`` file. ============= ================ ============================================== Architecture Level of support Constraints ============= ================ ============================================== +``arm64`` Maintained Little Endian only. ``loongarch`` Maintained - ``um`` Maintained ``x86_64`` only. ``x86`` Maintained ``x86_64`` only. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d435139..8a5f3823242c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -229,6 +229,7 @@ config ARM64 select HAVE_FUNCTION_ARG_ACCESS_API select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ + select HAVE_RUST if CPU_LITTLE_ENDIAN select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a88cdf910687..0e075d3c546b 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -41,6 +41,8 @@ KBUILD_CFLAGS += -mgeneral-regs-only \ KBUILD_CFLAGS += $(call cc-disable-warning, psabi) KBUILD_AFLAGS += $(compat_vdso) +KBUILD_RUSTFLAGS += --target=aarch64-unknown-none -Ctarget-feature="-neon" + KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) @@ -65,7 +67,9 @@ endif ifeq ($(CONFIG_ARM64_BTI_KERNEL),y) KBUILD_CFLAGS += -mbranch-protection=pac-ret+bti + KBUILD_RUSTFLAGS += -Zbranch-protection=bti,pac-ret else ifeq ($(CONFIG_ARM64_PTR_AUTH_KERNEL),y) + KBUILD_RUSTFLAGS += -Zbranch-protection=pac-ret ifeq ($(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET),y) KBUILD_CFLAGS += -mbranch-protection=pac-ret else diff --git a/rust/Makefile b/rust/Makefile index 88f88a26e503..fe045dbc701e 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -297,6 +297,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ # Derived from `scripts/Makefile.clang`. BINDGEN_TARGET_x86 := x86_64-linux-gnu +BINDGEN_TARGET_arm64 := aarch64-linux-gnu BINDGEN_TARGET := $(BINDGEN_TARGET_$(SRCARCH)) # All warnings are inhibited since GCC builds are very experimental, diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 0da52b548ba5..9e117957ea7b 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -148,7 +148,9 @@ fn main() { let mut ts = TargetSpec::new(); // `llvm-target`s are taken from `scripts/Makefile.clang`. 
- if cfg.has("X86_64") { + if cfg.has("ARM64") { + panic!("arm64 uses the builtin rustc aarch64-unknown-none target"); + } else if cfg.has("X86_64") { ts.push("arch", "x86_64"); ts.push( "data-layout", From df2675ad7748ea1a4bdb47f9835a30f50d4dc5ea Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 6 Feb 2024 10:27:54 +0000 Subject: [PATCH 029/134] perf/arm_cspmu: Simplify initialisation It's far simpler for implementations to literally override whichever default ops they want to, by initialising to the default ops first. This saves all the bother of checking what the impl_init_ops call has or hasn't touched. Make the same clear distinction for the PMIIDR override as well, in case we gain more sources for overriding that in future. Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/dd39718ee4890fd46a8e443c25303e87ae23f422.1706718007.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 55 +++++++++++---------------- drivers/perf/arm_cspmu/nvidia_cspmu.c | 6 --- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 3513bf50fefa..95e78290571e 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -100,13 +100,6 @@ #define ARM_CSPMU_ACTIVE_CPU_MASK 0x0 #define ARM_CSPMU_ASSOCIATED_CPU_MASK 0x1 -/* Check and use default if implementer doesn't provide attribute callback */ -#define CHECK_DEFAULT_IMPL_OPS(ops, callback) \ - do { \ - if (!ops->callback) \ - ops->callback = arm_cspmu_ ## callback; \ - } while (0) - /* * Maximum poll count for reading counter value using high-low-high sequence. */ @@ -408,21 +401,32 @@ static struct arm_cspmu_impl_match *arm_cspmu_impl_match_get(u32 pmiidr) return NULL; } +#define DEFAULT_IMPL_OP(name) .name = arm_cspmu_##name + static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) { int ret = 0; - struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops; struct acpi_apmt_node *apmt_node = arm_cspmu_apmt_node(cspmu->dev); struct arm_cspmu_impl_match *match; - /* - * Get PMU implementer and product id from APMT node. - * If APMT node doesn't have implementer/product id, try get it - * from PMIIDR. - */ - cspmu->impl.pmiidr = - (apmt_node->impl_id) ? apmt_node->impl_id : - readl(cspmu->base0 + PMIIDR); + /* Start with a default PMU implementation */ + cspmu->impl.module = THIS_MODULE; + cspmu->impl.pmiidr = readl(cspmu->base0 + PMIIDR); + cspmu->impl.ops = (struct arm_cspmu_impl_ops) { + DEFAULT_IMPL_OP(get_event_attrs), + DEFAULT_IMPL_OP(get_format_attrs), + DEFAULT_IMPL_OP(get_identifier), + DEFAULT_IMPL_OP(get_name), + DEFAULT_IMPL_OP(is_cycle_counter_event), + DEFAULT_IMPL_OP(event_type), + DEFAULT_IMPL_OP(event_filter), + DEFAULT_IMPL_OP(set_ev_filter), + DEFAULT_IMPL_OP(event_attr_is_visible), + }; + + /* Firmware may override implementer/product ID from PMIIDR */ + if (apmt_node->impl_id) + cspmu->impl.pmiidr = apmt_node->impl_id; /* Find implementer specific attribute ops. */ match = arm_cspmu_impl_match_get(cspmu->impl.pmiidr); @@ -450,24 +454,9 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) } mutex_unlock(&arm_cspmu_lock); + } - if (ret) - return ret; - } else - cspmu->impl.module = THIS_MODULE; - - /* Use default callbacks if implementer doesn't provide one. 
*/ - CHECK_DEFAULT_IMPL_OPS(impl_ops, get_event_attrs); - CHECK_DEFAULT_IMPL_OPS(impl_ops, get_format_attrs); - CHECK_DEFAULT_IMPL_OPS(impl_ops, get_identifier); - CHECK_DEFAULT_IMPL_OPS(impl_ops, get_name); - CHECK_DEFAULT_IMPL_OPS(impl_ops, is_cycle_counter_event); - CHECK_DEFAULT_IMPL_OPS(impl_ops, event_type); - CHECK_DEFAULT_IMPL_OPS(impl_ops, event_filter); - CHECK_DEFAULT_IMPL_OPS(impl_ops, event_attr_is_visible); - CHECK_DEFAULT_IMPL_OPS(impl_ops, set_ev_filter); - - return 0; + return ret; } static struct attribute_group * diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 0382b702f092..5b84b701ad62 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -388,12 +388,6 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) impl_ops->get_format_attrs = nv_cspmu_get_format_attrs; impl_ops->get_name = nv_cspmu_get_name; - /* Set others to NULL to use default callback. */ - impl_ops->event_type = NULL; - impl_ops->event_attr_is_visible = NULL; - impl_ops->get_identifier = NULL; - impl_ops->is_cycle_counter_event = NULL; - return 0; } From 7e6a3c3f85886f5e54dba5eb3bee7f5400685e95 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 6 Feb 2024 10:27:55 +0000 Subject: [PATCH 030/134] perf/arm_cspmu: Simplify attribute groups The attribute group array itself is always the same, so there's no need to allocate it separately. Storing it directly in our instance data saves memory and gives us one less point of failure. Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/cf12b803114b0815438833fcb2495f20f2007761.1706718007.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 26 +++++++++----------------- drivers/perf/arm_cspmu/arm_cspmu.h | 1 + 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 95e78290571e..e87bf0f64f01 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -501,23 +501,16 @@ arm_cspmu_alloc_format_attr_group(struct arm_cspmu *cspmu) return format_group; } -static struct attribute_group ** -arm_cspmu_alloc_attr_group(struct arm_cspmu *cspmu) +static int arm_cspmu_alloc_attr_groups(struct arm_cspmu *cspmu) { - struct attribute_group **attr_groups = NULL; - struct device *dev = cspmu->dev; + const struct attribute_group **attr_groups = cspmu->attr_groups; const struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops; cspmu->identifier = impl_ops->get_identifier(cspmu); cspmu->name = impl_ops->get_name(cspmu); if (!cspmu->identifier || !cspmu->name) - return NULL; - - attr_groups = devm_kcalloc(dev, 5, sizeof(struct attribute_group *), - GFP_KERNEL); - if (!attr_groups) - return NULL; + return -ENOMEM; attr_groups[0] = arm_cspmu_alloc_event_attr_group(cspmu); attr_groups[1] = arm_cspmu_alloc_format_attr_group(cspmu); @@ -525,9 +518,9 @@ arm_cspmu_alloc_attr_group(struct arm_cspmu *cspmu) attr_groups[3] = &arm_cspmu_cpumask_attr_group; if (!attr_groups[0] || !attr_groups[1]) - return NULL; + return -ENOMEM; - return attr_groups; + return 0; } static inline void arm_cspmu_reset_counters(struct arm_cspmu *cspmu) @@ -1164,11 +1157,10 @@ static int arm_cspmu_get_cpus(struct arm_cspmu *cspmu) static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu) { int ret, capabilities; - struct attribute_group **attr_groups; - attr_groups = arm_cspmu_alloc_attr_group(cspmu); - 
if (!attr_groups) - return -ENOMEM; + ret = arm_cspmu_alloc_attr_groups(cspmu); + if (ret) + return ret; ret = cpuhp_state_add_instance(arm_cspmu_cpuhp_state, &cspmu->cpuhp_node); @@ -1190,7 +1182,7 @@ static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu) .start = arm_cspmu_start, .stop = arm_cspmu_stop, .read = arm_cspmu_read, - .attr_groups = (const struct attribute_group **)attr_groups, + .attr_groups = cspmu->attr_groups, .capabilities = capabilities, }; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 2fe723555a6b..c9163acfe810 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -157,6 +157,7 @@ struct arm_cspmu { int cycle_counter_logical_idx; struct arm_cspmu_hw_events hw_events; + const struct attribute_group *attr_groups[5]; struct arm_cspmu_impl impl; }; From e7e8fa8e82afddb3c9ff56d6a7030558776c5b1c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 6 Feb 2024 10:27:56 +0000 Subject: [PATCH 031/134] perf/arm_cspmu: Simplify counter reset arm_cspmu_reset_counters() inherently also stops them since it is writing 0 to PMCR.E, so there should be no need to do that twice. Also tidy up the reset routine itself for consistency with the start and stop routines, and to be clear at first glance that it is simply writing a constant value. Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/3105815327989f6bb7bb068994d0eb4096b4ef64.1706718007.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index e87bf0f64f01..5919ef63c2a8 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -525,11 +525,7 @@ static int arm_cspmu_alloc_attr_groups(struct arm_cspmu *cspmu) static inline void arm_cspmu_reset_counters(struct arm_cspmu *cspmu) { - u32 pmcr = 0; - - pmcr |= PMCR_P; - pmcr |= PMCR_C; - writel(pmcr, cspmu->base0 + PMCR); + writel(PMCR_C | PMCR_P, cspmu->base0 + PMCR); } static inline void arm_cspmu_start_counters(struct arm_cspmu *cspmu) @@ -1187,7 +1183,6 @@ static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu) }; /* Hardware counter init */ - arm_cspmu_stop_counters(cspmu); arm_cspmu_reset_counters(cspmu); ret = perf_pmu_register(&cspmu->pmu, cspmu->name, -1); From 7255cfb19941b4681e545be47b9f13b61b1b4cb6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 6 Feb 2024 10:27:57 +0000 Subject: [PATCH 032/134] dt-bindings/perf: Add Arm CoreSight PMU Add a binding for implementations of the Arm CoreSight Performance Monitoring Unit Architecture. Not to be confused with CoreSight debug and trace, the PMU architecture defines a standard MMIO interface for event counters following a similar design to the CPU PMU architecture, where the implementation and most of its features are discoverable through ID registers. 
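To sketch what that discovery looks like from software (illustrative only: base0, PMIIDR and the exact field masks here are assumptions modelled on the arm_cspmu driver, not taken from this patch):

  u32 pmiidr = readl(base0 + PMIIDR);

  /* JEP106-style implementer code and product ID, per PMIIDR */
  u32 implementer = FIELD_GET(GENMASK(11, 0), pmiidr);
  u32 productid   = FIELD_GET(GENMASK(31, 20), pmiidr);

  /* the <implementer, productid> pair selects implementation-specific
   * ops; anything unmatched falls back to the architected defaults */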
Reviewed-by: Rob Herring Signed-off-by: Robin Murphy Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/c62a86ef177bec5c6d12176c605de900e9e40c87.1706718007.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- .../bindings/perf/arm,coresight-pmu.yaml | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/arm,coresight-pmu.yaml diff --git a/Documentation/devicetree/bindings/perf/arm,coresight-pmu.yaml b/Documentation/devicetree/bindings/perf/arm,coresight-pmu.yaml new file mode 100644 index 000000000000..985b62990f80 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/arm,coresight-pmu.yaml @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/perf/arm,coresight-pmu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Arm Coresight Performance Monitoring Unit Architecture + +maintainers: + - Robin Murphy + +properties: + compatible: + const: arm,coresight-pmu + + reg: + items: + - description: Register page 0 + - description: Register page 1, if the PMU implements the dual-page extension + minItems: 1 + + interrupts: + items: + - description: Overflow interrupt + + cpus: + description: If the PMU is associated with a particular CPU or subset of CPUs, + array of phandles to the appropriate CPU node(s) + + reg-io-width: + description: Granularity at which PMU register accesses are single-copy atomic + default: 4 + enum: [4, 8] + +required: + - compatible + - reg + +additionalProperties: false From fd185a245155be9cb90839fa451ba8f2c3e4004c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 6 Feb 2024 10:27:58 +0000 Subject: [PATCH 033/134] perf/arm_cspmu: Add devicetree support Hook up devicetree probing support. For now let's hope that people implement PMIIDR properly and we don't need an override property or match data mechanism. Reviewed-by: Ilkka Koskinen Signed-off-by: Robin Murphy Reviewed-by: Besar Wicaksono Tested-by: Besar Wicaksono Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/836722034302ff62f2df56aaeb0036e71945a5d1.1706718007.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 67 ++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 5919ef63c2a8..b9a252272f1e 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -114,7 +115,9 @@ static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, static struct acpi_apmt_node *arm_cspmu_apmt_node(struct device *dev) { - return *(struct acpi_apmt_node **)dev_get_platdata(dev); + struct acpi_apmt_node **ptr = dev_get_platdata(dev); + + return ptr ? *ptr : NULL; } /* @@ -310,6 +313,10 @@ static const char *arm_cspmu_get_name(const struct arm_cspmu *cspmu) dev = cspmu->dev; apmt_node = arm_cspmu_apmt_node(dev); + if (!apmt_node) + return devm_kasprintf(dev, GFP_KERNEL, PMUNAME "_%u", + atomic_fetch_inc(&pmu_idx[0])); + pmu_type = apmt_node->type; if (pmu_type >= ACPI_APMT_NODE_TYPE_COUNT) { @@ -425,7 +432,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) }; /* Firmware may override implementer/product ID from PMIIDR */ - if (apmt_node->impl_id) + if (apmt_node && apmt_node->impl_id) cspmu->impl.pmiidr = apmt_node->impl_id; /* Find implementer specific attribute ops. 
*/ @@ -940,7 +947,14 @@ static struct arm_cspmu *arm_cspmu_alloc(struct platform_device *pdev) platform_set_drvdata(pdev, cspmu); apmt_node = arm_cspmu_apmt_node(dev); - cspmu->has_atomic_dword = apmt_node->flags & ACPI_APMT_FLAGS_ATOMIC; + if (apmt_node) { + cspmu->has_atomic_dword = apmt_node->flags & ACPI_APMT_FLAGS_ATOMIC; + } else { + u32 width = 0; + + device_property_read_u32(dev, "reg-io-width", &width); + cspmu->has_atomic_dword = (width == 8); + } return cspmu; } @@ -1131,11 +1145,6 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) } } - if (cpumask_empty(&cspmu->associated_cpus)) { - dev_dbg(cspmu->dev, "No cpu associated with the PMU\n"); - return -ENODEV; - } - return 0; } #else @@ -1145,9 +1154,36 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu) } #endif +static int arm_cspmu_of_get_cpus(struct arm_cspmu *cspmu) +{ + struct of_phandle_iterator it; + int ret, cpu; + + of_for_each_phandle(&it, ret, dev_of_node(cspmu->dev), "cpus", NULL, 0) { + cpu = of_cpu_node_to_id(it.node); + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, &cspmu->associated_cpus); + } + return ret == -ENOENT ? 0 : ret; +} + static int arm_cspmu_get_cpus(struct arm_cspmu *cspmu) { - return arm_cspmu_acpi_get_cpus(cspmu); + int ret = 0; + + if (arm_cspmu_apmt_node(cspmu->dev)) + ret = arm_cspmu_acpi_get_cpus(cspmu); + else if (device_property_present(cspmu->dev, "cpus")) + ret = arm_cspmu_of_get_cpus(cspmu); + else + cpumask_copy(&cspmu->associated_cpus, cpu_possible_mask); + + if (!ret && cpumask_empty(&cspmu->associated_cpus)) { + dev_dbg(cspmu->dev, "No cpu associated with the PMU\n"); + ret = -ENODEV; + } + return ret; } static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu) @@ -1242,11 +1278,18 @@ static const struct platform_device_id arm_cspmu_id[] = { }; MODULE_DEVICE_TABLE(platform, arm_cspmu_id); +static const struct of_device_id arm_cspmu_of_match[] = { + { .compatible = "arm,coresight-pmu" }, + {} +}; +MODULE_DEVICE_TABLE(of, arm_cspmu_of_match); + static struct platform_driver arm_cspmu_driver = { .driver = { - .name = DRVNAME, - .suppress_bind_attrs = true, - }, + .name = DRVNAME, + .of_match_table = arm_cspmu_of_match, + .suppress_bind_attrs = true, + }, .probe = arm_cspmu_device_probe, .remove_new = arm_cspmu_device_remove, .id_table = arm_cspmu_id, From 48157aa39286b8eddfb81eeaab4d64d0231450e7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:47 +0100 Subject: [PATCH 034/134] arm64: kernel: Manage absolute relocations in code built under pi/ The mini C runtime runs before relocations are processed, and so it cannot rely on statically initialized pointer variables. Add a check to ensure that such code does not get introduced by accident, by going over the relocations in each object, identifying the ones that operate on data sections that are part of the executable image, and raising an error if any relocations of type R_AARCH64_ABS64 exist. Note that such relocations are permitted in other places (e.g., debug sections) and will never occur in compiler generated code sections when using the small code model, so only check sections that have SHF_ALLOC set and SHF_EXECINSTR cleared. To accommodate cases where statically initialized symbol references are unavoidable, introduce a special case for ELF input data sections that have ".rodata.prel64" in their names, and in these cases, instead of rejecting any encountered ABS64 relocations, convert them into PREL64 relocations, which don't require any runtime fixups. 
Note that the code in question must still be modified to deal with this, as it needs to convert the 64-bit signed offsets into absolute addresses before use. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-46-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/Makefile | 9 ++- arch/arm64/kernel/pi/pi.h | 18 +++++ arch/arm64/kernel/pi/relacheck.c | 130 +++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/kernel/pi/pi.h create mode 100644 arch/arm64/kernel/pi/relacheck.c diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index c844a0546d7f..bc32a431fe35 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -22,11 +22,16 @@ KCSAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +hostprogs := relacheck + +quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) + cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) + $(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ --remove-section=.note.gnu.property \ --prefix-alloc-sections=.init -$(obj)/%.pi.o: $(obj)/%.o FORCE - $(call if_changed,objcopy) +$(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE + $(call if_changed,piobjcopy) $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h new file mode 100644 index 000000000000..7c2d9bbf0ff9 --- /dev/null +++ b/arch/arm64/kernel/pi/pi.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel + +#define __prel64_initconst __section(".init.rodata.prel64") + +#define PREL64(type, name) union { type *name; prel64_t name ## _prel; } + +#define prel64_pointer(__d) (typeof(__d))prel64_to_pointer(&__d##_prel) + +typedef volatile signed long prel64_t; + +static inline void *prel64_to_pointer(const prel64_t *offset) +{ + if (!*offset) + return NULL; + return (void *)offset + *offset; +} diff --git a/arch/arm64/kernel/pi/relacheck.c b/arch/arm64/kernel/pi/relacheck.c new file mode 100644 index 000000000000..b0cd4d0d275b --- /dev/null +++ b/arch/arm64/kernel/pi/relacheck.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 - Google LLC + * Author: Ard Biesheuvel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define HOST_ORDER ELFDATA2LSB +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define HOST_ORDER ELFDATA2MSB +#endif + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static const char *strtab; +static bool swap; + +static uint64_t swab_elfxword(uint64_t val) +{ + return swap ? __builtin_bswap64(val) : val; +} + +static uint32_t swab_elfword(uint32_t val) +{ + return swap ? __builtin_bswap32(val) : val; +} + +static uint16_t swab_elfhword(uint16_t val) +{ + return swap ? 
__builtin_bswap16(val) : val; +} + +int main(int argc, char *argv[]) +{ + struct stat stat; + int fd, ret; + + if (argc < 3) { + fprintf(stderr, "file arguments missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + swap = ehdr->e_ident[EI_DATA] != HOST_ORDER; + shdr = (void *)ehdr + swab_elfxword(ehdr->e_shoff); + strtab = (void *)ehdr + + swab_elfxword(shdr[swab_elfhword(ehdr->e_shstrndx)].sh_offset); + + for (int i = 0; i < swab_elfhword(ehdr->e_shnum); i++) { + unsigned long info, flags; + bool prel64 = false; + Elf64_Rela *rela; + int numrela; + + if (swab_elfword(shdr[i].sh_type) != SHT_RELA) + continue; + + /* only consider RELA sections operating on data */ + info = swab_elfword(shdr[i].sh_info); + flags = swab_elfxword(shdr[info].sh_flags); + if ((flags & (SHF_ALLOC | SHF_EXECINSTR)) != SHF_ALLOC) + continue; + + /* + * We generally don't permit ABS64 relocations in the code that + * runs before relocation processing occurs. If statically + * initialized absolute symbol references are unavoidable, they + * may be emitted into a *.rodata.prel64 section and they will + * be converted to place-relative 64-bit references. This + * requires special handling in the referring code. + */ + if (strstr(strtab + swab_elfword(shdr[info].sh_name), + ".rodata.prel64")) { + prel64 = true; + } + + rela = (void *)ehdr + swab_elfxword(shdr[i].sh_offset); + numrela = swab_elfxword(shdr[i].sh_size) / sizeof(*rela); + + for (int j = 0; j < numrela; j++) { + uint64_t info = swab_elfxword(rela[j].r_info); + + if (ELF64_R_TYPE(info) != R_AARCH64_ABS64) + continue; + + if (prel64) { + /* convert ABS64 into PREL64 */ + info ^= R_AARCH64_ABS64 ^ R_AARCH64_PREL64; + rela[j].r_info = swab_elfxword(info); + } else { + fprintf(stderr, + "Unexpected absolute relocations detected in %s\n", + argv[2]); + close(fd); + unlink(argv[1]); + exit(EXIT_FAILURE); + } + } + } + close(fd); + return 0; +} From a86aa72eb3b075b985473d1d2973c7d00f568f17 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:48 +0100 Subject: [PATCH 035/134] arm64: kernel: Don't rely on objcopy to make code under pi/ __init We will add some code under pi/ that contains global variables that should not end up in __initdata, as they will not be writable via the initial ID map. So only rely on objcopy for making the libfdt code __init, and use explicit annotations for the rest. 
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-47-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/Makefile | 6 ++++-- arch/arm64/kernel/pi/kaslr_early.c | 16 +++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index bc32a431fe35..2bbe866417d4 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -28,11 +28,13 @@ quiet_cmd_piobjcopy = $(quiet_cmd_objcopy) cmd_piobjcopy = $(cmd_objcopy) && $(obj)/relacheck $(@) $(<) $(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \ - --remove-section=.note.gnu.property \ - --prefix-alloc-sections=.init + --remove-section=.note.gnu.property $(obj)/%.pi.o: $(obj)/%.o $(obj)/relacheck FORCE $(call if_changed,piobjcopy) +# ensure that all the lib- code ends up as __init code and data +$(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init + $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index b9e0bb4bc6a9..167081b30a15 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -17,7 +17,7 @@ #include /* taken from lib/string.c */ -static char *__strstr(const char *s1, const char *s2) +static char *__init __strstr(const char *s1, const char *s2) { size_t l1, l2; @@ -33,7 +33,7 @@ static char *__strstr(const char *s1, const char *s2) } return NULL; } -static bool cmdline_contains_nokaslr(const u8 *cmdline) +static bool __init cmdline_contains_nokaslr(const u8 *cmdline) { const u8 *str; @@ -41,7 +41,7 @@ static bool cmdline_contains_nokaslr(const u8 *cmdline) return str == cmdline || (str > cmdline && *(str - 1) == ' '); } -static bool is_kaslr_disabled_cmdline(void *fdt) +static bool __init is_kaslr_disabled_cmdline(void *fdt) { if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { int node; @@ -67,17 +67,19 @@ out: return cmdline_contains_nokaslr(CONFIG_CMDLINE); } -static u64 get_kaslr_seed(void *fdt) +static u64 __init get_kaslr_seed(void *fdt) { + static char const chosen_str[] __initconst = "chosen"; + static char const seed_str[] __initconst = "kaslr-seed"; int node, len; fdt64_t *prop; u64 ret; - node = fdt_path_offset(fdt, "/chosen"); + node = fdt_path_offset(fdt, chosen_str); if (node < 0) return 0; - prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + prop = fdt_getprop_w(fdt, node, seed_str, &len); if (!prop || len != sizeof(u64)) return 0; @@ -86,7 +88,7 @@ static u64 get_kaslr_seed(void *fdt) return ret; } -asmlinkage u64 kaslr_early_init(void *fdt) +asmlinkage u64 __init kaslr_early_init(void *fdt) { u64 seed, range; From 734958ef0b5497b1b9cb827afb541e7825477bbd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:49 +0100 Subject: [PATCH 036/134] arm64: head: move relocation handling to C code Now that we have a mini C runtime before the kernel mapping is up, we can move the non-trivial relocation processing code out of head.S and reimplement it in C. 
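As a worked example of the stream format the C implementation below decodes (the values are made-up image offsets; the runtime load offset is applied on top):

  /*
   * entry 0: 0x1000 (even) -> address entry: relocate the 64-bit word
   *          at offset 0x1000; the bitmap base becomes 0x1008
   * entry 1: 0x17 (odd)    -> bitmap 0b10111: bit 0 is the format tag;
   *          bits 1, 2 and 4 relocate the words at offsets 0x1008,
   *          0x1010 and 0x1020; the base then advances by 63 words to
   *          0x1200, ready for the next entry
   */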
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-48-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/Makefile | 3 +- arch/arm64/kernel/head.S | 104 +++----------------------------- arch/arm64/kernel/pi/Makefile | 5 +- arch/arm64/kernel/pi/relocate.c | 62 +++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 12 ++-- 5 files changed, 82 insertions(+), 104 deletions(-) create mode 100644 arch/arm64/kernel/pi/relocate.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 467cb7117273..78f14084f6d7 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -57,7 +57,8 @@ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/ +obj-$(CONFIG_RELOCATABLE) += pi/ +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_ELF_CORE) += elfcore.o obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cab7f91949d8..a8fa64fc30d7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -81,7 +81,7 @@ * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob - * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset + * x23 __primary_switch() physical misalignment/KASLR offset * x24 __primary_switch() linear map KASLR seed * x25 primary_entry() .. start_kernel() supported VA size * x28 create_idmap() callee preserved temp register @@ -389,7 +389,7 @@ SYM_FUNC_START_LOCAL(create_idmap) /* Remap the kernel page tables r/w in the ID map */ adrp x1, _text adrp x2, init_pg_dir - adrp x3, init_pg_end + adrp x3, _end bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 mov_q x5, SWAPPER_RW_MMUFLAGS mov x6, #SWAPPER_BLOCK_SHIFT @@ -779,97 +779,6 @@ SYM_FUNC_START_LOCAL(__no_granule_support) b 1b SYM_FUNC_END(__no_granule_support) -#ifdef CONFIG_RELOCATABLE -SYM_FUNC_START_LOCAL(__relocate_kernel) - /* - * Iterate over each entry in the relocation table, and apply the - * relocations in place. - */ - adr_l x9, __rela_start - adr_l x10, __rela_end - mov_q x11, KIMAGE_VADDR // default virtual offset - add x11, x11, x23 // actual virtual offset - -0: cmp x9, x10 - b.hs 1f - ldp x12, x13, [x9], #24 - ldr x14, [x9, #-8] - cmp w13, #R_AARCH64_RELATIVE - b.ne 0b - add x14, x14, x23 // relocate - str x14, [x12, x23] - b 0b - -1: -#ifdef CONFIG_RELR - /* - * Apply RELR relocations. - * - * RELR is a compressed format for storing relative relocations. The - * encoded sequence of entries looks like: - * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] - * - * i.e. start with an address, followed by any number of bitmaps. The - * address entry encodes 1 relocation. The subsequent bitmap entries - * encode up to 63 relocations each, at subsequent offsets following - * the last address entry. - * - * The bitmap entries must have 1 in the least significant bit. The - * assumption here is that an address cannot have 1 in lsb. Odd - * addresses are not supported. Any odd addresses are stored in the RELA - * section, which is handled above. 
- * - * Excluding the least significant bit in the bitmap, each non-zero - * bit in the bitmap represents a relocation to be applied to - * a corresponding machine word that follows the base address - * word. The second least significant bit represents the machine - * word immediately following the initial address, and each bit - * that follows represents the next word, in linear order. As such, - * a single bitmap can encode up to 63 relocations in a 64-bit object. - * - * In this implementation we store the address of the next RELR table - * entry in x9, the address being relocated by the current address or - * bitmap entry in x13 and the address being relocated by the current - * bit in x14. - */ - adr_l x9, __relr_start - adr_l x10, __relr_end - -2: cmp x9, x10 - b.hs 7f - ldr x11, [x9], #8 - tbnz x11, #0, 3f // branch to handle bitmaps - add x13, x11, x23 - ldr x12, [x13] // relocate address entry - add x12, x12, x23 - str x12, [x13], #8 // adjust to start of bitmap - b 2b - -3: mov x14, x13 -4: lsr x11, x11, #1 - cbz x11, 6f - tbz x11, #0, 5f // skip bit if not set - ldr x12, [x14] // relocate bit - add x12, x12, x23 - str x12, [x14] - -5: add x14, x14, #8 // move to next bit's address - b 4b - -6: /* - * Move to the next bitmap's address. 8 is the word size, and 63 is the - * number of significant bits in a bitmap entry. - */ - add x13, x13, #(8 * 63) - b 2b - -7: -#endif - ret - -SYM_FUNC_END(__relocate_kernel) -#endif - SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir @@ -877,11 +786,11 @@ SYM_FUNC_START_LOCAL(__primary_switch) #ifdef CONFIG_RELOCATABLE adrp x23, KERNEL_START and x23, x23, MIN_KIMG_ALIGN - 1 -#ifdef CONFIG_RANDOMIZE_BASE - mov x0, x22 - adrp x1, init_pg_end + adrp x1, early_init_stack mov sp, x1 mov x29, xzr +#ifdef CONFIG_RANDOMIZE_BASE + mov x0, x22 bl __pi_kaslr_early_init and x24, x0, #SZ_2M - 1 // capture memstart offset seed bic x0, x0, #SZ_2M - 1 @@ -894,7 +803,8 @@ SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, init_pg_dir load_ttbr1 x1, x1, x2 #ifdef CONFIG_RELOCATABLE - bl __relocate_kernel + mov x0, x23 + bl __pi_relocate_kernel #endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 2bbe866417d4..d084c1dcf416 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -38,5 +38,6 @@ $(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +obj-y := relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c new file mode 100644 index 000000000000..1853408ea76b --- /dev/null +++ b/arch/arm64/kernel/pi/relocate.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Authors: Ard Biesheuvel +// Peter Collingbourne + +#include +#include +#include + +extern const Elf64_Rela rela_start[], rela_end[]; +extern const u64 relr_start[], relr_end[]; + +void __init relocate_kernel(u64 offset) +{ + u64 *place = NULL; + + for (const Elf64_Rela *rela = rela_start; rela < rela_end; rela++) { + if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE) + continue; + *(u64 *)(rela->r_offset + offset) = rela->r_addend + offset; + } + + if 
(!IS_ENABLED(CONFIG_RELR) || !offset) + return; + + /* + * Apply RELR relocations. + * + * RELR is a compressed format for storing relative relocations. The + * encoded sequence of entries looks like: + * [ AAAAAAAA BBBBBBB1 BBBBBBB1 ... AAAAAAAA BBBBBB1 ... ] + * + * i.e. start with an address, followed by any number of bitmaps. The + * address entry encodes 1 relocation. The subsequent bitmap entries + * encode up to 63 relocations each, at subsequent offsets following + * the last address entry. + * + * The bitmap entries must have 1 in the least significant bit. The + * assumption here is that an address cannot have 1 in lsb. Odd + * addresses are not supported. Any odd addresses are stored in the + * RELA section, which is handled above. + * + * With the exception of the least significant bit, each bit in the + * bitmap corresponds with a machine word that follows the base address + * word, and the bit value indicates whether or not a relocation needs + * to be applied to it. The second least significant bit represents the + * machine word immediately following the initial address, and each bit + * that follows represents the next word, in linear order. As such, a + * single bitmap can encode up to 63 relocations in a 64-bit object. + */ + for (const u64 *relr = relr_start; relr < relr_end; relr++) { + if ((*relr & 1) == 0) { + place = (u64 *)(*relr + offset); + *place++ += offset; + } else { + for (u64 *p = place, r = *relr >> 1; r; p++, r >>= 1) + if (r & 1) + *p += offset; + place += 63; + } + } +} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3cd7e76cc562..8dd5dda66f7c 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -270,15 +270,15 @@ SECTIONS HYPERVISOR_RELOC_SECTION .rela.dyn : ALIGN(8) { - __rela_start = .; + __pi_rela_start = .; *(.rela .rela*) - __rela_end = .; + __pi_rela_end = .; } .relr.dyn : ALIGN(8) { - __relr_start = .; + __pi_relr_start = .; *(.relr.dyn) - __relr_end = .; + __pi_relr_end = .; } . = ALIGN(SEGMENT_ALIGN); @@ -317,6 +317,10 @@ SECTIONS init_pg_dir = .; . += INIT_DIR_SIZE; init_pg_end = .; +#ifdef CONFIG_RELOCATABLE + . += SZ_4K; /* stack for the early relocation code */ + early_init_stack = .; +#endif . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); From e223a449125571daa62debd8249fa4fc2da0a961 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:50 +0100 Subject: [PATCH 037/134] arm64: idreg-override: Move to early mini C runtime We will want to parse the ID register overrides even earlier, so that we can take them into account before creating the kernel mapping. So migrate the code and make it work in the context of the early C runtime. We will move the invocation to an earlier stage in a subsequent patch. 
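Note that the migrated code keeps using the prel64 helpers from pi.h for its statically initialized tables. In rough outline (a sketch following the pattern in this file, with the descriptor contents elided):

  static const struct ftr_set_desc mmfr1 __initconst = {
          /* ... register name, override pointer, fields ... */
  };

  static const
  PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = {
          { .reg = &mmfr1 },
  };

  /* at runtime, turn the stored place-relative offset back into an
   * absolute pointer */
  const struct ftr_set_desc *reg = prel64_pointer(regs[0].reg);

The &mmfr1 initializer would ordinarily need an ABS64 relocation; because the array lives in a *.rodata.prel64 section, relacheck rewrites it into a PREL64 relocation at build time, so no runtime fixup is required.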
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-49-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/Makefile | 4 +-- arch/arm64/kernel/head.S | 5 ++- arch/arm64/kernel/image-vars.h | 9 ++++++ arch/arm64/kernel/pi/Makefile | 5 +-- arch/arm64/kernel/{ => pi}/idreg-override.c | 35 +++++++++------------ 5 files changed, 30 insertions(+), 28 deletions(-) rename arch/arm64/kernel/{ => pi}/idreg-override.c (93%) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 78f14084f6d7..4236f1e0fffa 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,8 +33,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idreg-override.o idle.o \ - patching.o + syscall.o proton-pack.o idle.o patching.o pi/ obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o @@ -57,7 +56,6 @@ obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-$(CONFIG_PARAVIRT) += paravirt.o -obj-$(CONFIG_RELOCATABLE) += pi/ obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o obj-$(CONFIG_ELF_CORE) += elfcore.o diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a8fa64fc30d7..ca5e5fbefcd3 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -510,10 +510,9 @@ SYM_FUNC_START_LOCAL(__primary_switched) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x21 // pass FDT address in x0 - bl early_fdt_map // Try mapping the FDT early mov x0, x20 // pass the full boot status - bl init_feature_override // Parse cpu feature overrides + mov x1, x22 // pass the low FDT mapping + bl __pi_init_feature_override // Parse cpu feature overrides #ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS bl scs_patch_vmlinux #endif diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e931ce078a00..eacc3d167733 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -37,6 +37,15 @@ PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); PROVIDE(__pi_vabits_actual = vabits_actual); +PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); +PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); +PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); +PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); +PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); +PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); +PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); +PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); +PROVIDE(__pi__ctype = _ctype); #ifdef CONFIG_KVM diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index d084c1dcf416..7f6dfce893c3 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -38,6 +38,7 @@ $(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := relocate.pi.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +obj-y := idreg-override.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o extra-y := 
$(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c similarity index 93% rename from arch/arm64/kernel/idreg-override.c rename to arch/arm64/kernel/pi/idreg-override.c index e30fd9e32ef3..f9e05c10faab 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -14,6 +14,8 @@ #include #include +#include "pi.h" + #define FTR_DESC_NAME_LEN 20 #define FTR_DESC_FIELD_LEN 10 #define FTR_ALIAS_NAME_LEN 30 @@ -21,15 +23,6 @@ static u64 __boot_status __initdata; -// temporary __prel64 related definitions -// to be removed when this code is moved under pi/ - -#define __prel64_initconst __initconst - -#define PREL64(type, name) union { type *name; } - -#define prel64_pointer(__d) (__d) - typedef bool filter_t(u64 val); struct ftr_set_desc { @@ -313,16 +306,11 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(void) +static __init const u8 *get_bootargs_cmdline(const void *fdt) { const u8 *prop; - void *fdt; int node; - fdt = get_early_fdt_ptr(); - if (!fdt) - return NULL; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; @@ -334,9 +322,9 @@ static __init const u8 *get_bootargs_cmdline(void) return strlen(prop) ? prop : NULL; } -static __init void parse_cmdline(void) +static __init void parse_cmdline(const void *fdt) { - const u8 *prop = get_bootargs_cmdline(); + const u8 *prop = get_bootargs_cmdline(fdt); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) __parse_cmdline(CONFIG_CMDLINE, true); @@ -346,9 +334,9 @@ static __init void parse_cmdline(void) } /* Keep checkers quiet */ -void init_feature_override(u64 boot_status); +void init_feature_override(u64 boot_status, const void *fdt); -asmlinkage void __init init_feature_override(u64 boot_status) +asmlinkage void __init init_feature_override(u64 boot_status, const void *fdt) { struct arm64_ftr_override *override; const struct ftr_set_desc *reg; @@ -364,7 +352,7 @@ asmlinkage void __init init_feature_override(u64 boot_status) __boot_status = boot_status; - parse_cmdline(); + parse_cmdline(fdt); for (i = 0; i < ARRAY_SIZE(regs); i++) { reg = prel64_pointer(regs[i].reg); @@ -373,3 +361,10 @@ asmlinkage void __init init_feature_override(u64 boot_status) (unsigned long)(override + 1)); } } + +char * __init skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} From 9c4cd2a7d12c2c5b11efe7831b54e46c73eb3a8c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:51 +0100 Subject: [PATCH 038/134] arm64: kernel: Remove early fdt remap code The early FDT remap code is no longer used so let's drop it. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-50-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/setup.h | 3 --- arch/arm64/kernel/setup.c | 15 --------------- 2 files changed, 18 deletions(-) diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index 2e4d7da74fb8..ba269a7a3201 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -7,9 +7,6 @@ #include -void *get_early_fdt_ptr(void); -void early_fdt_map(u64 dt_phys); - /* * These two variables are used in the head.S file. 
*/ diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 42c690bb2d60..97d2143669cf 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -166,21 +166,6 @@ static void __init smp_build_mpidr_hash(void) pr_warn("Large number of MPIDR hash buckets detected\n"); } -static void *early_fdt_ptr __initdata; - -void __init *get_early_fdt_ptr(void) -{ - return early_fdt_ptr; -} - -asmlinkage void __init early_fdt_map(u64 dt_phys) -{ - int fdt_size; - - early_fixmap_init(); - early_fdt_ptr = fixmap_remap_fdt(dt_phys, &fdt_size, PAGE_KERNEL); -} - static void __init setup_machine_fdt(phys_addr_t dt_phys) { int size; From aa99aad798a8bc6d35ada2af1dc38f75d364e1ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:52 +0100 Subject: [PATCH 039/134] arm64: head: Clear BSS and the kernel page tables in one go We will move the CPU feature overrides into BSS in a subsequent patch, and this requires that BSS is zeroed before the feature override detection code runs. So let's map BSS read-write in the ID map, and zero it via this mapping. Since the kernel page tables are right next to it, and also zeroed via the ID map, let's drop the separate clear_page_tables() function, and just zero everything in one go. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-51-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 33 +++++++++++---------------------- arch/arm64/kernel/vmlinux.lds.S | 3 +++ 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ca5e5fbefcd3..2af518161f3a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -177,17 +177,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) ret SYM_CODE_END(preserve_boot_args) -SYM_FUNC_START_LOCAL(clear_page_tables) - /* - * Clear the init page tables. - */ - adrp x0, init_pg_dir - adrp x1, init_pg_end - sub x2, x1, x0 - mov x1, xzr - b __pi_memset // tail call -SYM_FUNC_END(clear_page_tables) - /* * Macro to populate page table entries, these entries can be pointers to the next level * or last level entries pointing to physical memory. @@ -386,9 +375,9 @@ SYM_FUNC_START_LOCAL(create_idmap) map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - /* Remap the kernel page tables r/w in the ID map */ + /* Remap BSS and the kernel page tables r/w in the ID map */ adrp x1, _text - adrp x2, init_pg_dir + adrp x2, __bss_start adrp x3, _end bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 mov_q x5, SWAPPER_RW_MMUFLAGS @@ -489,14 +478,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) mov x0, x20 bl set_cpu_boot_mode_flag - // Clear BSS - adr_l x0, __bss_start - mov x1, xzr - adr_l x2, __bss_stop - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW - #if VA_BITS > 48 adr_l x8, vabits_actual // Set this early so KASAN early init str x25, [x8] // ... 
observes the correct value @@ -782,6 +763,15 @@ SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu + + // Clear BSS + adrp x0, __bss_start + mov x1, xzr + adrp x2, init_pg_end + sub x2, x2, x0 + bl __pi_memset + dsb ishst // Make zero page visible to PTW + #ifdef CONFIG_RELOCATABLE adrp x23, KERNEL_START and x23, x23, MIN_KIMG_ALIGN - 1 @@ -796,7 +786,6 @@ SYM_FUNC_START_LOCAL(__primary_switch) orr x23, x23, x0 // record kernel offset #endif #endif - bl clear_page_tables bl create_kernel_mapping adrp x1, init_pg_dir diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 8dd5dda66f7c..8a3c6aacc355 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -311,12 +311,15 @@ SECTIONS __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); _edata = .; + /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) . = ALIGN(PAGE_SIZE); init_pg_dir = .; . += INIT_DIR_SIZE; init_pg_end = .; + /* end of zero-init region */ + #ifdef CONFIG_RELOCATABLE . += SZ_4K; /* stack for the early relocation code */ early_init_stack = .; From 30687dec5ed5576c743a4cd012a91f93848fe902 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:53 +0100 Subject: [PATCH 040/134] arm64: Move feature overrides into the BSS section In order to allow the CPU feature override detection code to run even earlier, move the feature override global variables into BSS, which is the only part of the static kernel image that is mapped read-write in the initial ID map. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-52-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d1a634a403e..a99ad79adee2 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -655,13 +655,13 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64pfr1_override; -struct arm64_ftr_override __ro_after_init id_aa64zfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; -struct arm64_ftr_override __ro_after_init id_aa64isar1_override; -struct arm64_ftr_override __ro_after_init id_aa64isar2_override; +struct arm64_ftr_override id_aa64mmfr1_override; +struct arm64_ftr_override id_aa64pfr0_override; +struct arm64_ftr_override id_aa64pfr1_override; +struct arm64_ftr_override id_aa64zfr0_override; +struct arm64_ftr_override id_aa64smfr0_override; +struct arm64_ftr_override id_aa64isar1_override; +struct arm64_ftr_override id_aa64isar2_override; struct arm64_ftr_override arm64_sw_feature_override; From dcfe969a641984fcd2b52aa257e478443612c050 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:54 +0100 Subject: [PATCH 041/134] arm64: head: Run feature override detection before mapping the kernel To permit the feature overrides to be taken into account before the KASLR init code runs and the kernel mapping is created, move the detection code to an earlier stage in the boot. 
In a subsequent patch, this will be taken advantage of by merging the preliminary and permanent mappings of the kernel text and data into a single one that gets created and relocated before start_kernel() is called. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-53-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 17 +++++++++-------- arch/arm64/kernel/vmlinux.lds.S | 4 +--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2af518161f3a..865ecc1f8255 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -375,9 +375,9 @@ SYM_FUNC_START_LOCAL(create_idmap) map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - /* Remap BSS and the kernel page tables r/w in the ID map */ + /* Remap [.init].data, BSS and the kernel page tables r/w in the ID map */ adrp x1, _text - adrp x2, __bss_start + adrp x2, __initdata_begin adrp x3, _end bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 mov_q x5, SWAPPER_RW_MMUFLAGS @@ -491,9 +491,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif - mov x0, x20 // pass the full boot status - mov x1, x22 // pass the low FDT mapping - bl __pi_init_feature_override // Parse cpu feature overrides #ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS bl scs_patch_vmlinux #endif @@ -772,12 +769,16 @@ SYM_FUNC_START_LOCAL(__primary_switch) bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_RELOCATABLE - adrp x23, KERNEL_START - and x23, x23, MIN_KIMG_ALIGN - 1 adrp x1, early_init_stack mov sp, x1 mov x29, xzr + mov x0, x20 // pass the full boot status + mov x1, x22 // pass the low FDT mapping + bl __pi_init_feature_override // Parse cpu feature overrides + +#ifdef CONFIG_RELOCATABLE + adrp x23, KERNEL_START + and x23, x23, MIN_KIMG_ALIGN - 1 #ifdef CONFIG_RANDOMIZE_BASE mov x0, x22 bl __pi_kaslr_early_init diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 8a3c6aacc355..3afb4223a5e8 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -320,10 +320,8 @@ SECTIONS init_pg_end = .; /* end of zero-init region */ -#ifdef CONFIG_RELOCATABLE - . += SZ_4K; /* stack for the early relocation code */ + . += SZ_4K; /* stack for the early C runtime */ early_init_stack = .; -#endif . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); From 8a6e40e1f68e9fc44497db88e0c0f21bb513c551 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:55 +0100 Subject: [PATCH 042/134] arm64: head: move dynamic shadow call stack patching into early C runtime Once we update the early kernel mapping code to only map the kernel once with the right permissions, we can no longer perform code patching via this mapping. So move this code to an earlier stage of the boot, right after applying the relocations. 
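For illustration, the ordering that the early C runtime ends up with later
in this series looks roughly like this (a simplified sketch using names
from subsequent patches, not the literal call site):

	/* sketch: SCS patching right after the relocations are applied */
	relocate_kernel(kaslr_offset);
	scs_patch(__eh_frame_start + va_offset,
		  __eh_frame_end - __eh_frame_start);

Both calls run via the initial mapping, before the kernel text is mapped
with its final, non-writable permissions.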
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-54-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 4 ++-- arch/arm64/kernel/Makefile | 8 -------- arch/arm64/kernel/head.S | 8 +++++--- arch/arm64/kernel/module.c | 2 +- arch/arm64/kernel/pi/Makefile | 10 ++++++---- arch/arm64/kernel/{ => pi}/patch-scs.c | 26 +++++++++++++------------- 6 files changed, 27 insertions(+), 31 deletions(-) rename arch/arm64/kernel/{ => pi}/patch-scs.c (91%) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 3fdae5fe3142..eca2ba5a6276 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -72,8 +72,8 @@ static inline void dynamic_scs_init(void) static inline void dynamic_scs_init(void) {} #endif -int scs_patch(const u8 eh_frame[], int size); -asmlinkage void scs_patch_vmlinux(void); +int __pi_scs_patch(const u8 eh_frame[], int size); +asmlinkage void __pi_scs_patch_vmlinux(void); #endif /* __ASSEMBLY __ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 4236f1e0fffa..14b4a179bad3 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -71,14 +71,6 @@ obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o -obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o - -# We need to prevent the SCS patching code from patching itself. Using -# -mbranch-protection=none here to avoid the patchable PAC opcodes from being -# generated triggers an issue with full LTO on Clang, which stops emitting PAC -# instructions altogether. So disable LTO as well for the compilation unit. -CFLAGS_patch-scs.o += -mbranch-protection=none -CFLAGS_REMOVE_patch-scs.o += $(CC_FLAGS_LTO) # Force dependency (vdso*-wrap.S includes vdso.so through incbin) $(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 865ecc1f8255..b320702032a7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -490,9 +490,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) #endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init -#endif -#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS - bl scs_patch_vmlinux #endif mov x0, x20 bl finalise_el2 // Prefer VHE if possible @@ -794,6 +791,11 @@ SYM_FUNC_START_LOCAL(__primary_switch) #ifdef CONFIG_RELOCATABLE mov x0, x23 bl __pi_relocate_kernel +#endif +#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS + ldr x0, =__eh_frame_start + ldr x1, =__eh_frame_end + bl __pi_scs_patch_vmlinux #endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index dd851297596e..47e0be610bb6 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -595,7 +595,7 @@ int module_finalize(const Elf_Ehdr *hdr, if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); if (s) - scs_patch((void *)s->sh_addr, s->sh_size); + __pi_scs_patch((void *)s->sh_addr, s->sh_size); } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 7f6dfce893c3..a8b302245f15 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -38,7 +38,9 @@ $(obj)/lib-%.pi.o: OBJCOPYFLAGS += --prefix-alloc-sections=.init $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := idreg-override.pi.o 
lib-fdt.pi.o lib-fdt_ro.pi.o -obj-$(CONFIG_RELOCATABLE) += relocate.pi.o -obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o -extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) +obj-y := idreg-override.pi.o \ + lib-fdt.pi.o lib-fdt_ro.pi.o +obj-$(CONFIG_RELOCATABLE) += relocate.pi.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o +obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.pi.o +extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/arm64/kernel/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c similarity index 91% rename from arch/arm64/kernel/patch-scs.c rename to arch/arm64/kernel/pi/patch-scs.c index a1fe4b4ff591..c65ef40d1e6b 100644 --- a/arch/arm64/kernel/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -4,14 +4,11 @@ * Author: Ard Biesheuvel */ -#include #include #include #include -#include #include -#include #include // @@ -81,7 +78,11 @@ static void __always_inline scs_patch_loc(u64 loc) */ return; } - dcache_clean_pou(loc, loc + sizeof(u32)); + if (IS_ENABLED(CONFIG_ARM64_WORKAROUND_CLEAN_CACHE)) + asm("dc civac, %0" :: "r"(loc)); + else + asm(ALTERNATIVE("dc cvau, %0", "nop", ARM64_HAS_CACHE_IDC) + :: "r"(loc)); } /* @@ -128,10 +129,10 @@ struct eh_frame { }; }; -static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, - int code_alignment_factor, - bool dry_run) +static int scs_handle_fde_frame(const struct eh_frame *frame, + bool fde_has_augmentation_data, + int code_alignment_factor, + bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); @@ -198,14 +199,13 @@ static int noinstr scs_handle_fde_frame(const struct eh_frame *frame, break; default: - pr_err("unhandled opcode: %02x in FDE frame %lx\n", opcode[-1], (uintptr_t)frame); return -ENOEXEC; } } return 0; } -int noinstr scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size) { const u8 *p = eh_frame; @@ -251,12 +251,12 @@ int noinstr scs_patch(const u8 eh_frame[], int size) return 0; } -asmlinkage void __init scs_patch_vmlinux(void) +asmlinkage void __init scs_patch_vmlinux(const u8 start[], const u8 end[]) { if (!should_patch_pac_into_scs()) return; - WARN_ON(scs_patch(__eh_frame_start, __eh_frame_end - __eh_frame_start)); - icache_inval_all_pou(); + scs_patch(start, end - start); + asm("ic ialluis"); isb(); } From 35876f35f4821c92fb1bbff7eec5780dba4fffdb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:56 +0100 Subject: [PATCH 043/134] arm64: cpufeature: Add helper to test for CPU feature overrides Add some helpers to extract and apply feature overrides to the bare idreg values. This involves inspecting the value and mask of the specific field that we are interested in, given that an override value/mask pair might be invalid for one field but valid for another. Then, wire up the new helper for the hVHE test - note that we can drop the sysreg test here, as the override will be invalid when trying to enable hVHE on non-VHE capable hardware. 
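As a worked example (with made-up numbers): take a 4-bit field at bit 8 of
an ID register whose raw field value is 2.

	/* sketch: apply an override to reg<11:8> and read back the field */
	u64 field = arm64_apply_feature_override(regval, 8, 4, &override);

If override->mask is 0xf00 and override->val is 0x100, all value bits are
covered by the mask, so the override is honoured and 'field' reads back as
1. If the early code invalidated the override (mask 0x0 but a non-zero
value in the field), the value bits are not covered by the mask, and the
raw field value 2 is returned unmodified.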
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-55-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 39 +++++++++++++++++++++++++++++ arch/arm64/kernel/cpufeature.c | 9 +------ 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 21c824edf8ce..acd8f4949583 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -915,6 +915,45 @@ extern struct arm64_ftr_override id_aa64isar2_override; extern struct arm64_ftr_override arm64_sw_feature_override; +static inline +u64 arm64_apply_feature_override(u64 val, int feat, int width, + const struct arm64_ftr_override *override) +{ + u64 oval = override->val; + + /* + * When it encounters an invalid override (e.g., an override that + * cannot be honoured due to a missing CPU feature), the early idreg + * override code will set the mask to 0x0 and the value to non-zero for + * the field in question. In order to determine whether the override is + * valid or not for the field we are interested in, we first need to + * disregard bits belonging to other fields. + */ + oval &= GENMASK_ULL(feat + width - 1, feat); + + /* + * The override is valid if all value bits are accounted for in the + * mask. If so, replace the masked bits with the override value. + */ + if (oval == (oval & override->mask)) { + val &= ~override->mask; + val |= oval; + } + + /* Extract the field from the updated value */ + return cpuid_feature_extract_unsigned_field(val, feat); +} + +static inline bool arm64_test_sw_feature_override(int feat) +{ + /* + * Software features are pseudo CPU features that have no underlying + * CPUID system register value to apply the override to. + */ + return arm64_apply_feature_override(0, feat, 4, + &arm64_sw_feature_override); +} + u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a99ad79adee2..d0ffb872a31a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2042,14 +2042,7 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, int __unused) { - u64 val; - - val = read_sysreg(id_aa64mmfr1_el1); - if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) - return false; - - val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; - return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); + return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); } #ifdef CONFIG_ARM64_PAN From af73b9a2dd39fb458627a325dcdc9c76e274eae0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:57 +0100 Subject: [PATCH 044/134] arm64: kaslr: Use feature override instead of parsing the cmdline again The early kaslr code open codes the detection of 'nokaslr' on the kernel command line, and this is no longer necessary now that the feature detection code, which also looks for the same string, executes before this code. 
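In other words, 'nokaslr' is now recorded exactly once, via the existing
{ "nokaslr", "arm64_sw.nokaslr=1" } alias, and the early KASLR code can
simply test the resulting override (sketch):

	/* sketch: shared helper replaces the open coded strstr() scan */
	if (kaslr_disabled_cmdline())
		return 0;	/* keep the kernel at its link time address */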
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-56-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 5 +++ arch/arm64/kernel/kaslr.c | 4 +-- arch/arm64/kernel/pi/kaslr_early.c | 53 +---------------------------- 3 files changed, 7 insertions(+), 55 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index acd8f4949583..e309255b7f04 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -954,6 +954,11 @@ static inline bool arm64_test_sw_feature_override(int feat) &arm64_sw_feature_override); } +static inline bool kaslr_disabled_cmdline(void) +{ + return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR); +} + u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 12c7f3c8ba76..1da3e25f9d9e 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -16,9 +16,7 @@ bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) { - if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & - arm64_sw_feature_override.mask, - ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { + if (kaslr_disabled_cmdline()) { pr_info("KASLR disabled on command line\n"); return; } diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 167081b30a15..f2305e276ec3 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -16,57 +16,6 @@ #include #include -/* taken from lib/string.c */ -static char *__init __strstr(const char *s1, const char *s2) -{ - size_t l1, l2; - - l2 = strlen(s2); - if (!l2) - return (char *)s1; - l1 = strlen(s1); - while (l1 >= l2) { - l1--; - if (!memcmp(s1, s2, l2)) - return (char *)s1; - s1++; - } - return NULL; -} -static bool __init cmdline_contains_nokaslr(const u8 *cmdline) -{ - const u8 *str; - - str = __strstr(cmdline, "nokaslr"); - return str == cmdline || (str > cmdline && *(str - 1) == ' '); -} - -static bool __init is_kaslr_disabled_cmdline(void *fdt) -{ - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - int node; - const u8 *prop; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; - - if (cmdline_contains_nokaslr(prop)) - return true; - - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - goto out; - - return false; - } -out: - return cmdline_contains_nokaslr(CONFIG_CMDLINE); -} - static u64 __init get_kaslr_seed(void *fdt) { static char const chosen_str[] __initconst = "chosen"; @@ -92,7 +41,7 @@ asmlinkage u64 __init kaslr_early_init(void *fdt) { u64 seed, range; - if (is_kaslr_disabled_cmdline(fdt)) + if (kaslr_disabled_cmdline()) return 0; seed = get_kaslr_seed(fdt); From 9ddd9baa42a01d383d278096a11b200b53ba9470 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:58 +0100 Subject: [PATCH 045/134] arm64: idreg-override: Create a pseudo feature for rodata=off Add rodata=off to the set of kernel command line options that is parsed early using the CPU feature override detection code, so we can easily refer to it when creating the kernel mapping. 
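For example, the early mapping code added later in this series can then
pick the kernel text permissions with a plain override test (a sketch of
the intended use):

	/* sketch: writable kernel text only when rodata=off was given */
	if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF))
		text_prot = PAGE_KERNEL_EXEC;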
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-57-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e309255b7f04..03c34242bfc7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -17,6 +17,7 @@ #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 +#define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index f9e05c10faab..e4bcabcc6860 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -159,6 +159,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { .fields = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), + FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), {} }, }; @@ -190,6 +191,7 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, + { "rodata=off", "arm64_sw.rodataoff=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) From a669c6a4935626786e456b25e9cf2bfbba908f15 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:28:59 +0100 Subject: [PATCH 046/134] arm64: Add helpers to probe local CPU for PAC and BTI support Add some helpers that will be used by the early kernel mapping code to check feature support on the local CPU. This permits the early kernel mapping to be created with the right attributes, removing the need for tearing it down and recreating it. 
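As a sketch of the intended use by the early mapping code in a later
patch:

	/* sketch: choose attributes based on the local CPU's features */
	if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti())
		text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
	if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac())
		enable_scs = false;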
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-58-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 32 +++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 03c34242bfc7..e3edae1825f3 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -963,6 +963,38 @@ static inline bool kaslr_disabled_cmdline(void) u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); +static inline bool cpu_has_bti(void) +{ + if (!IS_ENABLED(CONFIG_ARM64_BTI)) + return false; + + return arm64_apply_feature_override(read_cpuid(ID_AA64PFR1_EL1), + ID_AA64PFR1_EL1_BT_SHIFT, 4, + &id_aa64pfr1_override); +} + +static inline bool cpu_has_pac(void) +{ + u64 isar1, isar2; + + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; + + isar1 = read_cpuid(ID_AA64ISAR1_EL1); + isar2 = read_cpuid(ID_AA64ISAR2_EL1); + + if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_APA_SHIFT, 4, + &id_aa64isar1_override)) + return true; + + if (arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_API_SHIFT, 4, + &id_aa64isar1_override)) + return true; + + return arm64_apply_feature_override(isar2, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, + &id_aa64isar2_override); +} + #endif /* __ASSEMBLY__ */ #endif From 8d47b8e5c74a1be600fd68bbbb4c2ecd8d4cc33c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:00 +0100 Subject: [PATCH 047/134] arm64: head: allocate more pages for the kernel mapping In preparation for switching to an early kernel mapping routine that maps each segment according to its precise boundaries, and with the correct attributes, let's allocate some extra pages for page tables for the 4k page size configuration. This is necessary because the start and end of each segment may not be aligned to the block size, and so we'll need an extra page table at each segment boundary. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-59-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 83ddb14b95a5..0631604995ee 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -68,7 +68,7 @@ + EARLY_PGDS((vstart), (vend), add) /* each PGDIR needs a next level page table */ \ + EARLY_PUDS((vstart), (vend), add) /* each PUD needs a next level page table */ \ + EARLY_PMDS((vstart), (vend), add)) /* each PMD needs a next level page table */ -#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE)) +#define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE) + EARLY_SEGMENT_EXTRA_PAGES)) /* the initial ID map may need two extra pages if it needs to be extended */ #if VA_BITS < 48 @@ -89,6 +89,15 @@ #define SWAPPER_TABLE_SHIFT PMD_SHIFT #endif +/* The number of segments in the kernel image (text, rodata, inittext, initdata, data+bss) */ +#define KERNEL_SEGMENT_COUNT 5 + +#if SWAPPER_BLOCK_SIZE > SEGMENT_ALIGN +#define EARLY_SEGMENT_EXTRA_PAGES (KERNEL_SEGMENT_COUNT + 1) +#else +#define EARLY_SEGMENT_EXTRA_PAGES 0 +#endif + /* * Initial memory map attributes. 
*/ From aa6a52b2470c375ecd71b1d81c89d93b11134b56 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:01 +0100 Subject: [PATCH 048/134] arm64: head: move memstart_offset_seed handling to C code Now that we can set BSS variables from the early code running from the ID map, we can set memstart_offset_seed directly from the C code that derives the value instead of passing it back and forth between C and asm code. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-60-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/head.S | 7 ------- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/pi/kaslr_early.c | 4 ++++ 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b320702032a7..aa7766dc64d9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -82,7 +82,6 @@ * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x23 __primary_switch() physical misalignment/KASLR offset - * x24 __primary_switch() linear map KASLR seed * x25 primary_entry() .. start_kernel() supported VA size * x28 create_idmap() callee preserved temp register */ @@ -483,11 +482,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) str x25, [x8] // ... observes the correct value dc civac, x8 // Make visible to booting secondaries #endif - -#ifdef CONFIG_RANDOMIZE_BASE - adrp x5, memstart_offset_seed // Save KASLR linear map seed - strh w24, [x5, :lo12:memstart_offset_seed] -#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif @@ -779,7 +773,6 @@ SYM_FUNC_START_LOCAL(__primary_switch) #ifdef CONFIG_RANDOMIZE_BASE mov x0, x22 bl __pi_kaslr_early_init - and x24, x0, #SZ_2M - 1 // capture memstart offset seed bic x0, x0, #SZ_2M - 1 orr x23, x23, x0 // record kernel offset #endif diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index eacc3d167733..8d96052079e8 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -46,6 +46,7 @@ PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); PROVIDE(__pi__ctype = _ctype); +PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); #ifdef CONFIG_KVM diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index f2305e276ec3..eeecee7ffd6f 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -16,6 +16,8 @@ #include #include +extern u16 memstart_offset_seed; + static u64 __init get_kaslr_seed(void *fdt) { static char const chosen_str[] __initconst = "chosen"; @@ -51,6 +53,8 @@ asmlinkage u64 __init kaslr_early_init(void *fdt) return 0; } + memstart_offset_seed = seed & U16_MAX; + /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the From 293d865f0af58e6ff2ff0ba0e890674e00d036b1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:02 +0100 Subject: [PATCH 049/134] arm64: mm: Make kaslr_requires_kpti() a static inline In preparation for moving the first assignment of arm64_use_ng_mappings to an earlier stage in the boot, ensure that kaslr_requires_kpti() is accessible without relying on the core kernel's view on whether or not KASLR is enabled. 
So make it a static inline, and move the kaslr_enabled() check out of it
and into the callers, one of which will disappear in a subsequent patch.

Once support for the obsolete ThunderX 1 platform is dropped, this check
reduces to an E0PD feature check on the local CPU.

Signed-off-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20240214122845.2033971-61-ardb+git@google.com
Signed-off-by: Catalin Marinas
---
 arch/arm64/include/asm/mmu.h   | 38 +++++++++++++++++++++++++++++-
 arch/arm64/kernel/cpufeature.c | 42 +---------------------------------
 arch/arm64/kernel/setup.c      |  2 +-
 3 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 2fcf51231d6e..d0b8b4b413b6 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -71,7 +71,43 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       pgprot_t prot, bool page_mappings_only);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
-extern bool kaslr_requires_kpti(void);
+
+/*
+ * This check is triggered during the early boot before the cpufeature
+ * is initialised. Checking the status on the local CPU allows the boot
+ * CPU to detect the need for non-global mappings and thus avoiding a
+ * pagetable re-write after all the CPUs are booted. This check will be
+ * anyway run on individual CPUs, allowing us to get the consistent
+ * state once the SMP CPUs are up and thus make the switch to non-global
+ * mappings if required.
+ */
+static inline bool kaslr_requires_kpti(void)
+{
+	/*
+	 * E0PD does a similar job to KPTI so can be used instead
+	 * where available.
+	 */
+	if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
+		u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
+		if (cpuid_feature_extract_unsigned_field(mmfr2,
+						ID_AA64MMFR2_EL1_E0PD_SHIFT))
+			return false;
+	}
+
+	/*
+	 * Systems affected by Cavium erratum 27456 are incompatible
+	 * with KPTI.
+	 */
+	if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
+		extern const struct midr_range cavium_erratum_27456_cpus[];
+
+		if (is_midr_in_range_list(read_cpuid_id(),
+					  cavium_erratum_27456_cpus))
+			return false;
+	}
+
+	return true;
+}

 #define INIT_MM_CONTEXT(name)	\
 	.pgd = init_pg_dir,
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d0ffb872a31a..7064cf13f226 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1620,46 +1620,6 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
 	return has_cpuid_feature(entry, scope);
 }

-/*
- * This check is triggered during the early boot before the cpufeature
- * is initialised. Checking the status on the local CPU allows the boot
- * CPU to detect the need for non-global mappings and thus avoiding a
- * pagetable re-write after all the CPUs are booted. This check will be
- * anyway run on individual CPUs, allowing us to get the consistent
- * state once the SMP CPUs are up and thus make the switch to non-global
- * mappings if required.
- */
-bool kaslr_requires_kpti(void)
-{
-	if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
-		return false;
-
-	/*
-	 * E0PD does a similar job to KPTI so can be used instead
-	 * where available.
-	 */
-	if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
-		u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
-		if (cpuid_feature_extract_unsigned_field(mmfr2,
-						ID_AA64MMFR2_EL1_E0PD_SHIFT))
-			return false;
-	}
-
-	/*
-	 * Systems affected by Cavium erratum 24756 are incompatible
-	 * with KPTI.
- */ - if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { - extern const struct midr_range cavium_erratum_27456_cpus[]; - - if (is_midr_in_range_list(read_cpuid_id(), - cavium_erratum_27456_cpus)) - return false; - } - - return kaslr_enabled(); -} - static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ @@ -1712,7 +1672,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, } /* Useful for KASLR robustness */ - if (kaslr_requires_kpti()) { + if (kaslr_enabled() && kaslr_requires_kpti()) { if (!__kpti_forced) { str = "KASLR"; __kpti_forced = 1; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 97d2143669cf..0ef45d1927b3 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -288,7 +288,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) * mappings from the start, avoiding the cost of rewriting * everything later. */ - arm64_use_ng_mappings = kaslr_requires_kpti(); + arm64_use_ng_mappings = kaslr_enabled() && kaslr_requires_kpti(); early_fixmap_init(); early_ioremap_init(); From 82ca151da7d54d7571c5d511d016b7780d5d559f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:03 +0100 Subject: [PATCH 050/134] arm64: mmu: Make __cpu_replace_ttbr1() out of line __cpu_replace_ttbr1() is a static inline, and so it gets instantiated wherever it is used. This is not really necessary, as it is never called on a hot path. It also has the unfortunate side effect that the symbol idmap_cpu_replace_ttbr1 may never be referenced from kCFI enabled C code, and this means the type id symbol may not exist either. This will result in a build error once we start referring to this symbol from asm code as well. (Note that this problem only occurs when CnP, KAsan and suspend/resume are all disabled in the Kconfig but that is a valid config, if unusual). So let's just move it out of line so all callers will share the same implementation, which will reference idmap_cpu_replace_ttbr1 unconditionally. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-62-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 32 +--------------------------- arch/arm64/mm/mmu.c | 32 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 9ce4200508b1..926fbbcecbe0 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -148,37 +148,7 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz) isb(); } -/* - * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, - * avoiding the possibility of conflicting TLB entries being allocated. - */ -static inline void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp) -{ - typedef void (ttbr_replace_func)(phys_addr_t); - extern ttbr_replace_func idmap_cpu_replace_ttbr1; - ttbr_replace_func *replace_phys; - unsigned long daif; - - /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ - phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); - - if (cnp) - ttbr1 |= TTBR_CNP_BIT; - - replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); - - __cpu_install_idmap(idmap); - - /* - * We really don't want to take *any* exceptions while TTBR1 is - * in the process of being replaced so mask everything. 
- */ - daif = local_daif_save(); - replace_phys(ttbr1); - local_daif_restore(daif); - - cpu_uninstall_idmap(); -} +void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp); static inline void cpu_enable_swapper_cnp(void) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9..f9332eea318f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1486,3 +1486,35 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte { set_pte_at(vma->vm_mm, addr, ptep, pte); } + +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated. + */ +void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + unsigned long daif; + + /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ + phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); + + if (cnp) + ttbr1 |= TTBR_CNP_BIT; + + replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); + + __cpu_install_idmap(idmap); + + /* + * We really don't want to take *any* exceptions while TTBR1 is + * in the process of being replaced so mask everything. + */ + daif = local_daif_save(); + replace_phys(ttbr1); + local_daif_restore(daif); + + cpu_uninstall_idmap(); +} From 97a6f43bb049e64b9913c50c7530e13d78e205d4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:04 +0100 Subject: [PATCH 051/134] arm64: head: Move early kernel mapping routines into C code The asm version of the kernel mapping code works fine for creating a coarse grained identity map, but for mapping the kernel down to its exact boundaries with the right attributes, it is not suitable. This is why we create a preliminary RWX kernel mapping first, and then rebuild it from scratch later on. So let's reimplement this in C, in a way that will make it unnecessary to create the kernel page tables yet another time in paging_init(). 
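For example, mapping one kernel segment reduces to a single call into the
generic range mapper added below (a simplified sketch - the real
map_segment() helper additionally masks the VAs with ~PAGE_OFFSET for the
early page tables):

	/* sketch: map [start, end) at start + va_offset with 'prot' */
	map_range(&pgdp, (u64)start + va_offset, (u64)end + va_offset,
		  (u64)start, prot, root_level, (pte_t *)init_pg_dir,
		  may_use_cont, 0);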
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-63-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/archrandom.h | 2 - arch/arm64/include/asm/scs.h | 32 +---- arch/arm64/kernel/head.S | 52 +------- arch/arm64/kernel/image-vars.h | 19 +++ arch/arm64/kernel/pi/Makefile | 1 + arch/arm64/kernel/pi/idreg-override.c | 22 ++-- arch/arm64/kernel/pi/kaslr_early.c | 12 +- arch/arm64/kernel/pi/map_kernel.c | 164 ++++++++++++++++++++++++++ arch/arm64/kernel/pi/map_range.c | 88 ++++++++++++++ arch/arm64/kernel/pi/patch-scs.c | 16 +-- arch/arm64/kernel/pi/pi.h | 14 +++ arch/arm64/kernel/pi/relocate.c | 2 + arch/arm64/kernel/setup.c | 7 -- arch/arm64/kernel/vmlinux.lds.S | 4 +- arch/arm64/mm/proc.S | 1 + 15 files changed, 315 insertions(+), 121 deletions(-) create mode 100644 arch/arm64/kernel/pi/map_kernel.c create mode 100644 arch/arm64/kernel/pi/map_range.c diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index ecdb3cfcd0f8..8babfbe31f95 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -129,6 +129,4 @@ static inline bool __init __early_cpu_has_rndr(void) return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf; } -u64 kaslr_early_init(void *fdt); - #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index eca2ba5a6276..2e010ea76be2 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -33,37 +33,11 @@ #include #ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS -static inline bool should_patch_pac_into_scs(void) -{ - u64 reg; - - /* - * We only enable the shadow call stack dynamically if we are running - * on a system that does not implement PAC or BTI. PAC and SCS provide - * roughly the same level of protection, and BTI relies on the PACIASP - * instructions serving as landing pads, preventing us from patching - * those instructions into something else. - */ - reg = read_sysreg_s(SYS_ID_AA64ISAR1_EL1); - if (SYS_FIELD_GET(ID_AA64ISAR1_EL1, APA, reg) | - SYS_FIELD_GET(ID_AA64ISAR1_EL1, API, reg)) - return false; - - reg = read_sysreg_s(SYS_ID_AA64ISAR2_EL1); - if (SYS_FIELD_GET(ID_AA64ISAR2_EL1, APA3, reg)) - return false; - - if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) { - reg = read_sysreg_s(SYS_ID_AA64PFR1_EL1); - if (reg & (0xf << ID_AA64PFR1_EL1_BT_SHIFT)) - return false; - } - return true; -} - static inline void dynamic_scs_init(void) { - if (should_patch_pac_into_scs()) { + extern bool __pi_dynamic_scs_is_enabled; + + if (__pi_dynamic_scs_is_enabled) { pr_info("Enabling dynamic shadow call stack\n"); static_branch_enable(&dynamic_scs_enabled); } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index aa7766dc64d9..ffacce7b5a02 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -81,7 +81,6 @@ * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 * x22 create_idmap() .. start_kernel() ID map VA of the DT blob - * x23 __primary_switch() physical misalignment/KASLR offset * x25 primary_entry() .. 
start_kernel() supported VA size * x28 create_idmap() callee preserved temp register */ @@ -408,24 +407,6 @@ SYM_FUNC_START_LOCAL(create_idmap) 0: ret x28 SYM_FUNC_END(create_idmap) -SYM_FUNC_START_LOCAL(create_kernel_mapping) - adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR // compile time __va(_text) -#ifdef CONFIG_RELOCATABLE - add x5, x5, x23 // add KASLR displacement -#endif - adrp x6, _end // runtime __pa(_end) - adrp x3, _text // runtime __pa(_text) - sub x6, x6, x3 // _end - _text - add x6, x6, x5 // runtime __va(_end) - mov_q x7, SWAPPER_RW_MMUFLAGS - - map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14 - - dsb ishst // sync with page table walker - ret -SYM_FUNC_END(create_kernel_mapping) - /* * Initialize CPU registers with task-specific and cpu-specific context. * @@ -752,44 +733,13 @@ SYM_FUNC_START_LOCAL(__primary_switch) adrp x2, init_idmap_pg_dir bl __enable_mmu - // Clear BSS - adrp x0, __bss_start - mov x1, xzr - adrp x2, init_pg_end - sub x2, x2, x0 - bl __pi_memset - dsb ishst // Make zero page visible to PTW - adrp x1, early_init_stack mov sp, x1 mov x29, xzr mov x0, x20 // pass the full boot status mov x1, x22 // pass the low FDT mapping - bl __pi_init_feature_override // Parse cpu feature overrides + bl __pi_early_map_kernel // Map and relocate the kernel -#ifdef CONFIG_RELOCATABLE - adrp x23, KERNEL_START - and x23, x23, MIN_KIMG_ALIGN - 1 -#ifdef CONFIG_RANDOMIZE_BASE - mov x0, x22 - bl __pi_kaslr_early_init - bic x0, x0, #SZ_2M - 1 - orr x23, x23, x0 // record kernel offset -#endif -#endif - bl create_kernel_mapping - - adrp x1, init_pg_dir - load_ttbr1 x1, x1, x2 -#ifdef CONFIG_RELOCATABLE - mov x0, x23 - bl __pi_relocate_kernel -#endif -#ifdef CONFIG_UNWIND_PATCH_PAC_INTO_SCS - ldr x0, =__eh_frame_start - ldr x1, =__eh_frame_end - bl __pi_scs_patch_vmlinux -#endif ldr x8, =__primary_switched adrp x0, KERNEL_START // __pa(KERNEL_START) br x8 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8d96052079e8..e566b32f9c22 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -45,9 +45,28 @@ PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); +PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); +#endif PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); +PROVIDE(__pi_init_pg_dir = init_pg_dir); +PROVIDE(__pi_init_pg_end = init_pg_end); + +PROVIDE(__pi__text = _text); +PROVIDE(__pi__stext = _stext); +PROVIDE(__pi__etext = _etext); +PROVIDE(__pi___start_rodata = __start_rodata); +PROVIDE(__pi___inittext_begin = __inittext_begin); +PROVIDE(__pi___inittext_end = __inittext_end); +PROVIDE(__pi___initdata_begin = __initdata_begin); +PROVIDE(__pi___initdata_end = __initdata_end); +PROVIDE(__pi__data = _data); +PROVIDE(__pi___bss_start = __bss_start); +PROVIDE(__pi__end = _end); + #ifdef CONFIG_KVM /* diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index a8b302245f15..8c2f80a46b93 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -39,6 +39,7 @@ $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE $(call if_changed_rule,cc_o_c) obj-y := idreg-override.pi.o \ + map_kernel.pi.o map_range.pi.o \ 
lib-fdt.pi.o lib-fdt_ro.pi.o obj-$(CONFIG_RELOCATABLE) += relocate.pi.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_early.pi.o diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index e4bcabcc6860..1884bd936c0d 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -308,37 +308,35 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } -static __init const u8 *get_bootargs_cmdline(const void *fdt) +static __init const u8 *get_bootargs_cmdline(const void *fdt, int node) { + static char const bootargs[] __initconst = "bootargs"; const u8 *prop; - int node; - node = fdt_path_offset(fdt, "/chosen"); if (node < 0) return NULL; - prop = fdt_getprop(fdt, node, "bootargs", NULL); + prop = fdt_getprop(fdt, node, bootargs, NULL); if (!prop) return NULL; return strlen(prop) ? prop : NULL; } -static __init void parse_cmdline(const void *fdt) +static __init void parse_cmdline(const void *fdt, int chosen) { - const u8 *prop = get_bootargs_cmdline(fdt); + static char const cmdline[] __initconst = CONFIG_CMDLINE; + const u8 *prop = get_bootargs_cmdline(fdt, chosen); if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) - __parse_cmdline(CONFIG_CMDLINE, true); + __parse_cmdline(cmdline, true); if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); } -/* Keep checkers quiet */ -void init_feature_override(u64 boot_status, const void *fdt); - -asmlinkage void __init init_feature_override(u64 boot_status, const void *fdt) +void __init init_feature_override(u64 boot_status, const void *fdt, + int chosen) { struct arm64_ftr_override *override; const struct ftr_set_desc *reg; @@ -354,7 +352,7 @@ asmlinkage void __init init_feature_override(u64 boot_status, const void *fdt) __boot_status = boot_status; - parse_cmdline(fdt); + parse_cmdline(fdt, chosen); for (i = 0; i < ARRAY_SIZE(regs); i++) { reg = prel64_pointer(regs[i].reg); diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index eeecee7ffd6f..0257b43819db 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -16,17 +16,17 @@ #include #include +#include "pi.h" + extern u16 memstart_offset_seed; -static u64 __init get_kaslr_seed(void *fdt) +static u64 __init get_kaslr_seed(void *fdt, int node) { - static char const chosen_str[] __initconst = "chosen"; static char const seed_str[] __initconst = "kaslr-seed"; - int node, len; fdt64_t *prop; u64 ret; + int len; - node = fdt_path_offset(fdt, chosen_str); if (node < 0) return 0; @@ -39,14 +39,14 @@ static u64 __init get_kaslr_seed(void *fdt) return ret; } -asmlinkage u64 __init kaslr_early_init(void *fdt) +u64 __init kaslr_early_init(void *fdt, int chosen) { u64 seed, range; if (kaslr_disabled_cmdline()) return 0; - seed = get_kaslr_seed(fdt); + seed = get_kaslr_seed(fdt, chosen); if (!seed) { if (!__early_cpu_has_rndr() || !__arm64_rndr((unsigned long *)&seed)) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c new file mode 100644 index 000000000000..f206373b28b0 --- /dev/null +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "pi.h" + +extern const u8 __eh_frame_start[], __eh_frame_end[]; + +extern void idmap_cpu_replace_ttbr1(void *pgdir); + +static void __init 
map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, + void *start, void *end, pgprot_t prot, + bool may_use_cont, int root_level) +{ + map_range(pgd, ((u64)start + va_offset) & ~PAGE_OFFSET, + ((u64)end + va_offset) & ~PAGE_OFFSET, (u64)start, + prot, root_level, (pte_t *)pg_dir, may_use_cont, 0); +} + +static void __init unmap_segment(pgd_t *pg_dir, u64 va_offset, void *start, + void *end, int root_level) +{ + map_segment(pg_dir, NULL, va_offset, start, end, __pgprot(0), + false, root_level); +} + +static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) +{ + bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); + bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); + u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + pgprot_t prot; + + /* + * External debuggers may need to write directly to the text mapping to + * install SW breakpoints. Allow this (only) when explicitly requested + * with rodata=off. + */ + if (arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF)) + text_prot = PAGE_KERNEL_EXEC; + + /* + * We only enable the shadow call stack dynamically if we are running + * on a system that does not implement PAC or BTI. PAC and SCS provide + * roughly the same level of protection, and BTI relies on the PACIASP + * instructions serving as landing pads, preventing us from patching + * those instructions into something else. + */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpu_has_pac()) + enable_scs = false; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && cpu_has_bti()) { + enable_scs = false; + + /* + * If we have a CPU that supports BTI and a kernel built for + * BTI then mark the kernel executable text as guarded pages + * now so we don't have to rewrite the page tables later. + */ + text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); + } + + /* Map all code read-write on the first pass if needed */ + twopass |= enable_scs; + prot = twopass ? data_prot : text_prot; + + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, + __inittext_begin, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __inittext_begin, + __inittext_end, prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __initdata_begin, + __initdata_end, data_prot, false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _data, _end, data_prot, + true, root_level); + dsb(ishst); + + idmap_cpu_replace_ttbr1(init_pg_dir); + + if (twopass) { + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate_kernel(kaslr_offset); + + if (enable_scs) { + scs_patch(__eh_frame_start + va_offset, + __eh_frame_end - __eh_frame_start); + asm("ic ialluis"); + + dynamic_scs_is_enabled = true; + } + + /* + * Unmap the text region before remapping it, to avoid + * potential TLB conflicts when creating the contiguous + * descriptors. 
+ */ + unmap_segment(init_pg_dir, va_offset, _stext, _etext, + root_level); + dsb(ishst); + isb(); + __tlbi(vmalle1); + isb(); + + /* + * Remap these segments with different permissions + * No new page table allocations should be needed + */ + map_segment(init_pg_dir, NULL, va_offset, _stext, _etext, + text_prot, true, root_level); + map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, + __inittext_end, text_prot, false, root_level); + dsb(ishst); + } +} + +asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) +{ + static char const chosen_str[] __initconst = "/chosen"; + u64 va_base, pa_base = (u64)&_text; + u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; + int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int chosen; + + /* Clear BSS and the initial page tables */ + memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); + + /* Parse the command line for CPU feature overrides */ + chosen = fdt_path_offset(fdt, chosen_str); + init_feature_override(boot_status, fdt, chosen); + + /* + * The virtual KASLR displacement modulo 2MiB is decided by the + * physical placement of the image, as otherwise, we might not be able + * to create the early kernel mapping using 2 MiB block descriptors. So + * take the low bits of the KASLR offset from the physical address, and + * fill in the high bits from the seed. + */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + u64 kaslr_seed = kaslr_early_init(fdt, chosen); + + if (kaslr_seed && kaslr_requires_kpti()) + arm64_use_ng_mappings = true; + + kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); + } + + va_base = KIMAGE_VADDR + kaslr_offset; + map_kernel(kaslr_offset, va_base - pa_base, root_level); +} diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c new file mode 100644 index 000000000000..c31feda18f47 --- /dev/null +++ b/arch/arm64/kernel/pi/map_range.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright 2023 Google LLC +// Author: Ard Biesheuvel + +#include +#include + +#include +#include +#include + +#include "pi.h" + +/** + * map_range - Map a contiguous range of physical pages into virtual memory + * + * @pte: Address of physical pointer to array of pages to + * allocate page tables from + * @start: Virtual address of the start of the range + * @end: Virtual address of the end of the range (exclusive) + * @pa: Physical address of the start of the range + * @prot: Access permissions of the range + * @level: Translation level for the mapping + * @tbl: The level @level page table to create the mappings in + * @may_use_cont: Whether the use of the contiguous attribute is allowed + * @va_offset: Offset between a physical page and its current mapping + * in the VA space + */ +void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset) +{ + u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; + u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + int lshift = (3 - level) * (PAGE_SHIFT - 3); + u64 lmask = (PAGE_SIZE << lshift) - 1; + + start &= PAGE_MASK; + pa &= PAGE_MASK; + + /* Advance tbl to the entry that covers start */ + tbl += (start >> (lshift + PAGE_SHIFT)) % PTRS_PER_PTE; + + /* + * Set the right block/page bits for this level unless we are + * clearing the mapping + */ + if (protval) + protval |= (level < 3) ? 
PMD_TYPE_SECT : PTE_TYPE_PAGE; + + while (start < end) { + u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); + + if (level < 3 && (start | next | pa) & lmask) { + /* + * This chunk needs a finer grained mapping. Create a + * table mapping if necessary and recurse. + */ + if (pte_none(*tbl)) { + *tbl = __pte(__phys_to_pte_val(*pte) | + PMD_TYPE_TABLE | PMD_TABLE_UXN); + *pte += PTRS_PER_PTE * sizeof(pte_t); + } + map_range(pte, start, next, pa, prot, level + 1, + (pte_t *)(__pte_to_phys(*tbl) + va_offset), + may_use_cont, va_offset); + } else { + /* + * Start a contiguous range if start and pa are + * suitably aligned + */ + if (((start | pa) & cmask) == 0 && may_use_cont) + protval |= PTE_CONT; + + /* + * Clear the contiguous attribute if the remaining + * range does not cover a contiguous block + */ + if ((end & ~cmask) <= start) + protval &= ~PTE_CONT; + + /* Put down a block or page mapping */ + *tbl = __pte(__phys_to_pte_val(pa) | protval); + } + pa += next - start; + start = next; + tbl++; + } +} diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index c65ef40d1e6b..49d8b40e61bc 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -11,6 +11,10 @@ #include +#include "pi.h" + +bool dynamic_scs_is_enabled; + // // This minimal DWARF CFI parser is partially based on the code in // arch/arc/kernel/unwind.c, and on the document below: @@ -46,8 +50,6 @@ #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f -extern const u8 __eh_frame_start[], __eh_frame_end[]; - enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -250,13 +252,3 @@ int scs_patch(const u8 eh_frame[], int size) } return 0; } - -asmlinkage void __init scs_patch_vmlinux(const u8 start[], const u8 end[]) -{ - if (!should_patch_pac_into_scs()) - return; - - scs_patch(start, end - start); - asm("ic ialluis"); - isb(); -} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 7c2d9bbf0ff9..d307c58e9741 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -2,6 +2,8 @@ // Copyright 2023 Google LLC // Author: Ard Biesheuvel +#include + #define __prel64_initconst __section(".init.rodata.prel64") #define PREL64(type, name) union { type *name; prel64_t name ## _prel; } @@ -16,3 +18,15 @@ static inline void *prel64_to_pointer(const prel64_t *offset) return NULL; return (void *)offset + *offset; } + +extern bool dynamic_scs_is_enabled; + +void init_feature_override(u64 boot_status, const void *fdt, int chosen); +u64 kaslr_early_init(void *fdt, int chosen); +void relocate_kernel(u64 offset); +int scs_patch(const u8 eh_frame[], int size); + +void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset); + +asmlinkage void early_map_kernel(u64 boot_status, void *fdt); diff --git a/arch/arm64/kernel/pi/relocate.c b/arch/arm64/kernel/pi/relocate.c index 1853408ea76b..2407d2696398 100644 --- a/arch/arm64/kernel/pi/relocate.c +++ b/arch/arm64/kernel/pi/relocate.c @@ -7,6 +7,8 @@ #include #include +#include "pi.h" + extern const Elf64_Rela rela_start[], rela_end[]; extern const u64 relr_start[], relr_end[]; diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 0ef45d1927b3..0ea45b6d0177 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -283,13 +283,6 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) kaslr_init(); - /* - * If know now we are going to need KPTI then use non-global - * mappings from 
the start, avoiding the cost of rewriting - * everything later. - */ - arm64_use_ng_mappings = kaslr_enabled() && kaslr_requires_kpti(); - early_fixmap_init(); early_ioremap_init(); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 3afb4223a5e8..755a22d4f840 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -126,9 +126,9 @@ jiffies = jiffies_64; #ifdef CONFIG_UNWIND_TABLES #define UNWIND_DATA_SECTIONS \ .eh_frame : { \ - __eh_frame_start = .; \ + __pi___eh_frame_start = .; \ *(.eh_frame) \ - __eh_frame_end = .; \ + __pi___eh_frame_end = .; \ } #else #define UNWIND_DATA_SECTIONS diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index f66c37a1610e..7c1bdaf25408 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -195,6 +195,7 @@ SYM_TYPED_FUNC_START(idmap_cpu_replace_ttbr1) ret SYM_FUNC_END(idmap_cpu_replace_ttbr1) +SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) .popsection #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 From e6128a8e523cfa8f46a1501e6432c37523b47bdf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:05 +0100 Subject: [PATCH 052/134] arm64: mm: Use 48-bit virtual addressing for the permanent ID map Even though we support loading kernels anywhere in 48-bit addressable physical memory, we create the ID maps based on the number of levels that we happened to configure for the kernel VA and user VA spaces. The reason for this is that the PGD/PUD/PMD based classification of translation levels, along with the associated folding when the number of levels is less than 5, does not permit creating a page table hierarchy of a set number of levels. This means that, for instance, on 39-bit VA kernels we need to configure an additional level above PGD level on the fly, and 36-bit VA kernels still only support 47-bit virtual addressing with this trick applied. Now that we have a separate helper to populate page table hierarchies that does not define the levels in terms of PUDS/PMDS/etc at all, let's reuse it to create the permanent ID map with a fixed VA size of 48 bits. 
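As a back-of-the-envelope illustration of the level arithmetic (a minimal user-space sketch, not kernel code; the formula mirrors ARM64_HW_PGTABLE_LEVELS and the shifts correspond to the 4k/16k/64k granules):

#include <stdio.h>

/*
 * Each level of lookup resolves PAGE_SHIFT - 3 bits of the input
 * address and the page offset covers PAGE_SHIFT bits, so resolving
 * va_bits takes (va_bits - 4) / (PAGE_SHIFT - 3) levels, rounded up.
 */
static int pgtable_levels(int va_bits, int page_shift)
{
	return (va_bits - 4) / (page_shift - 3);
}

int main(void)
{
	int page_shifts[] = { 12, 14, 16 };	/* 4k, 16k, 64k granules */

	for (int i = 0; i < 3; i++) {
		int lvls = pgtable_levels(48, page_shifts[i]);

		printf("%2dk pages: IDMAP_LEVELS=%d IDMAP_ROOT_LEVEL=%d\n",
		       1 << (page_shifts[i] - 10), lvls, 4 - lvls);
	}
	return 0;
}

With IDMAP_VA_BITS fixed at 48, this yields 4 levels (root level 0) for 4k and 16k pages, and 3 levels (root level 1) for 64k pages, regardless of how many levels the kernel proper was configured with.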
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-64-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 3 +++ arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kvm/mmu.c | 15 ++++--------- arch/arm64/mm/mmu.c | 30 +++++++++++++------------ arch/arm64/mm/proc.S | 9 +++----- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 0631604995ee..742a4b2778f7 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -35,6 +35,9 @@ #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS) #endif +#define IDMAP_VA_BITS 48 +#define IDMAP_LEVELS ARM64_HW_PGTABLE_LEVELS(IDMAP_VA_BITS) +#define IDMAP_ROOT_LEVEL (4 - IDMAP_LEVELS) /* * A relocatable kernel may execute from an address that differs from the one at diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index ffacce7b5a02..a1c29d64e875 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -729,6 +729,11 @@ SYM_FUNC_START_LOCAL(__no_granule_support) SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) + mrs x1, tcr_el1 + mov x2, #64 - VA_BITS + tcr_set_t0sz x1, x2 + msr tcr_el1, x1 + adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d14504821b79..6fa9e816df40 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1874,16 +1874,9 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); /* - * The ID map may be configured to use an extended virtual address - * range. This is only the case if system RAM is out of range for the - * currently configured page size and VA_BITS_MIN, in which case we will - * also need the extended virtual range for the HYP ID map, or we won't - * be able to enable the EL2 MMU. - * - * However, in some cases the ID map may be configured for fewer than - * the number of VA bits used by the regular kernel stage 1. This - * happens when VA_BITS=52 and the kernel image is placed in PA space - * below 48 bits. + * The ID map is always configured for 48 bits of translation, which + * may be fewer than the number of VA bits used by the regular kernel + * stage 1, when VA_BITS=52. * * At EL2, there is only one TTBR register, and we can't switch between * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom @@ -1894,7 +1887,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) * 1 VA bits to assure that the hypervisor can both ID map its code page * and map any kernel memory. 
*/ - idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + idmap_bits = IDMAP_VA_BITS; kernel_bits = vabits_actual; *hyp_va_bits = max(idmap_bits, kernel_bits); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f9332eea318f..a991f195592b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -757,22 +757,21 @@ static void __init map_kernel(pgd_t *pgdp) kasan_copy_shadow(pgdp); } +void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, + int level, pte_t *tbl, bool may_use_cont, u64 va_offset); + +static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, + kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; + static void __init create_idmap(void) { u64 start = __pa_symbol(__idmap_text_start); - u64 size = __pa_symbol(__idmap_text_end) - start; - pgd_t *pgd = idmap_pg_dir; - u64 pgd_phys; + u64 end = __pa_symbol(__idmap_text_end); + u64 ptep = __pa_symbol(idmap_ptes); - /* check if we need an additional level of translation */ - if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) { - pgd_phys = early_pgtable_alloc(PAGE_SHIFT); - set_pgd(&idmap_pg_dir[start >> VA_BITS], - __pgd(pgd_phys | P4D_TYPE_TABLE)); - pgd = __va(pgd_phys); - } - __create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX, - early_pgtable_alloc, 0); + __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, + __phys_to_virt(ptep) - ptep); if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { extern u32 __idmap_kpti_flag; @@ -782,8 +781,10 @@ static void __init create_idmap(void) * The KPTI G-to-nG conversion code needs a read-write mapping * of its synchronization flag in the ID map. */ - __create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL, - early_pgtable_alloc, 0); + ptep = __pa_symbol(kpti_ptes); + __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, + __phys_to_virt(ptep) - ptep); } } @@ -808,6 +809,7 @@ void __init paging_init(void) memblock_allow_resize(); create_idmap(); + idmap_t0sz = TCR_T0SZ(IDMAP_VA_BITS); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 7c1bdaf25408..47ede52bb900 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -421,9 +421,9 @@ SYM_FUNC_START(__cpu_setup) mair .req x17 tcr .req x16 mov_q mair, MAIR_EL1_SET - mov_q tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ - TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS) | TCR_CACHE_FLAGS | \ + TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ + TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 @@ -431,10 +431,7 @@ SYM_FUNC_START(__cpu_setup) sub x9, xzr, x0 add x9, x9, #64 tcr_set_t1sz tcr, x9 -#else - idmap_get_t0sz x9 #endif - tcr_set_t0sz tcr, x9 /* * Set the IPS bits in TCR_EL1. From 34b98e55f6840cab938d480968c0f600a2ed97d5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:06 +0100 Subject: [PATCH 053/134] arm64: pgtable: Decouple PGDIR size macros from PGD/PUD/PMD levels The mapping from PGD/PUD/PMD to levels and shifts is very confusing, given that, due to folding, the shifts may be equal for different levels, if the macros are even #define'd to begin with. 
In a subsequent patch, we will modify the ID mapping code to decouple the number of levels from the kernel's view of how these types are folded, so prepare for this by reformulating the macros without the use of these types. Instead, use SWAPPER_BLOCK_SHIFT as the base quantity, and derive it from either PAGE_SHIFT or PMD_SHIFT, which -if defined at all- are defined unambiguously for a given page size, regardless of the number of configured levels. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-65-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 65 ++++++++----------------- 1 file changed, 19 insertions(+), 46 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 742a4b2778f7..f1fc98a233d5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -13,27 +13,22 @@ #include /* - * The linear mapping and the start of memory are both 2M aligned (per - * the arm64 booting.txt requirements). Hence we can use section mapping - * with 4K (section size = 2M) but not with 16K (section size = 32M) or - * 64K (section size = 512M). + * The physical and virtual addresses of the start of the kernel image are + * equal modulo 2 MiB (per the arm64 booting.txt requirements). Hence we can + * use section mapping with 4K (section size = 2M) but not with 16K (section + * size = 32M) or 64K (section size = 512M). */ - -/* - * The idmap and swapper page tables need some space reserved in the kernel - * image. Both require pgd, pud (4 levels only) and pmd tables to (section) - * map the kernel. With the 64K page configuration, swapper and idmap need to - * map to pte level. The swapper also maps the FDT (see __create_page_tables - * for more information). Note that the number of ID map translation levels - * could be increased on the fly if system RAM is out of reach for the default - * VA range, so pages required to map highest possible PA are reserved in all - * cases. - */ -#ifdef CONFIG_ARM64_4K_PAGES -#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1) +#if defined(PMD_SIZE) && PMD_SIZE <= MIN_KIMG_ALIGN +#define SWAPPER_BLOCK_SHIFT PMD_SHIFT +#define SWAPPER_SKIP_LEVEL 1 #else -#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS) +#define SWAPPER_BLOCK_SHIFT PAGE_SHIFT +#define SWAPPER_SKIP_LEVEL 0 #endif +#define SWAPPER_BLOCK_SIZE (UL(1) << SWAPPER_BLOCK_SHIFT) +#define SWAPPER_TABLE_SHIFT (SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3) + +#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL) #define IDMAP_VA_BITS 48 #define IDMAP_LEVELS ARM64_HW_PGTABLE_LEVELS(IDMAP_VA_BITS) @@ -53,24 +48,13 @@ #define EARLY_ENTRIES(vstart, vend, shift, add) \ (SPAN_NR_ENTRIES(vstart, vend, shift) + (add)) -#define EARLY_PGDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, PGDIR_SHIFT, add)) +#define EARLY_LEVEL(lvl, vstart, vend, add) \ + (SWAPPER_PGTABLE_LEVELS > lvl ? 
EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0) -#if SWAPPER_PGTABLE_LEVELS > 3 -#define EARLY_PUDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, PUD_SHIFT, add)) -#else -#define EARLY_PUDS(vstart, vend, add) (0) -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 -#define EARLY_PMDS(vstart, vend, add) (EARLY_ENTRIES(vstart, vend, SWAPPER_TABLE_SHIFT, add)) -#else -#define EARLY_PMDS(vstart, vend, add) (0) -#endif - -#define EARLY_PAGES(vstart, vend, add) ( 1 /* PGDIR page */ \ - + EARLY_PGDS((vstart), (vend), add) /* each PGDIR needs a next level page table */ \ - + EARLY_PUDS((vstart), (vend), add) /* each PUD needs a next level page table */ \ - + EARLY_PMDS((vstart), (vend), add)) /* each PMD needs a next level page table */ +#define EARLY_PAGES(vstart, vend, add) (1 /* PGDIR page */ \ + + EARLY_LEVEL(3, (vstart), (vend), add) /* each entry needs a next level page table */ \ + + EARLY_LEVEL(2, (vstart), (vend), add) /* each entry needs a next level page table */ \ + + EARLY_LEVEL(1, (vstart), (vend), add))/* each entry needs a next level page table */ #define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE) + EARLY_SEGMENT_EXTRA_PAGES)) /* the initial ID map may need two extra pages if it needs to be extended */ @@ -81,17 +65,6 @@ #endif #define INIT_IDMAP_DIR_PAGES EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE, 1) -/* Initial memory map size */ -#ifdef CONFIG_ARM64_4K_PAGES -#define SWAPPER_BLOCK_SHIFT PMD_SHIFT -#define SWAPPER_BLOCK_SIZE PMD_SIZE -#define SWAPPER_TABLE_SHIFT PUD_SHIFT -#else -#define SWAPPER_BLOCK_SHIFT PAGE_SHIFT -#define SWAPPER_BLOCK_SIZE PAGE_SIZE -#define SWAPPER_TABLE_SHIFT PMD_SHIFT -#endif - /* The number of segments in the kernel image (text, rodata, inittext, initdata, data+bss) */ #define KERNEL_SEGMENT_COUNT 5 From 84b04d3e6bdbc7551e62b75dd97cae4a8bddb1b6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:07 +0100 Subject: [PATCH 054/134] arm64: kernel: Create initial ID map from C code The asm code that creates the initial ID map is rather intricate and hard to follow. This is problematic because it makes adding support for things like LPA2 or WXN more difficult than necessary. Also, it is parameterized like the rest of the MM code to run with a configurable number of levels, which is rather pointless, given that all AArch64 CPUs implement support for 48-bit virtual addressing, and that many systems exist with DRAM located outside of the 39-bit addressable range, which is the only smaller VA size that is widely used, and we need additional tricks to make things work in that combination. So let's bite the bullet, and rip out all the asm macros, and fiddly code, and replace it with a C implementation based on the newly added routines for creating the early kernel VA mappings. And while at it, create the initial ID map based on 48-bit virtual addressing as well, regardless of the number of configured levels for the kernel proper. Note that this code may execute with the MMU and caches disabled, and is therefore not permitted to make unaligned accesses. This shouldn't generally happen in any case for the algorithm as implemented, but to be sure, let's pass -mstrict-align to the compiler just in case. 
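For a feel of the sizing math that the EARLY_PAGES()/EARLY_ENTRIES() macros perform, here is a user-space sketch (the helper mirrors the SPAN_NR_ENTRIES() logic; the image size and addresses are made up for illustration):

#include <stdio.h>
#include <stdint.h>

/* entries of size 2^shift needed to cover [vstart, vend) */
static uint64_t span_nr_entries(uint64_t vstart, uint64_t vend, int shift)
{
	return ((vend - 1) >> shift) - (vstart >> shift) + 1;
}

int main(void)
{
	/* hypothetical 46 MiB image at a 2 MiB aligned VA */
	uint64_t vstart = 0xffff800080000000ULL;
	uint64_t vend = vstart + 46 * 1024 * 1024;

	/* 4k granule: 2M block mappings, table shifts 30 and 39 above */
	printf("2M block entries: %llu\n",
	       (unsigned long long)span_nr_entries(vstart, vend, 21));
	printf("tables @ shift 30: %llu\n",
	       (unsigned long long)span_nr_entries(vstart, vend, 30));
	printf("tables @ shift 39: %llu\n",
	       (unsigned long long)span_nr_entries(vstart, vend, 39));
	return 0;
}

Each populated entry at a table level needs one next-level page, which is how the INIT_IDMAP_DIR_PAGES estimate is arrived at, with extra pages on top for segment misalignment and for the FDT.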
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-66-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 14 -- arch/arm64/include/asm/kernel-pgtable.h | 50 ++--- arch/arm64/include/asm/mmu_context.h | 6 +- arch/arm64/kernel/head.S | 267 ++---------------------- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/pi/Makefile | 3 + arch/arm64/kernel/pi/map_kernel.c | 18 ++ arch/arm64/kernel/pi/map_range.c | 12 ++ arch/arm64/kernel/pi/pi.h | 4 + arch/arm64/mm/mmu.c | 5 - arch/arm64/mm/proc.S | 3 +- 11 files changed, 88 insertions(+), 295 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 513787e43329..6a467c694039 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -345,20 +345,6 @@ alternative_cb_end bfi \valreg, \t1sz, #TCR_T1SZ_OFFSET, #TCR_TxSZ_WIDTH .endm -/* - * idmap_get_t0sz - get the T0SZ value needed to cover the ID map - * - * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the - * entire ID map region can be mapped. As T0SZ == (64 - #bits used), - * this number conveniently equals the number of leading zeroes in - * the physical address of _end. - */ - .macro idmap_get_t0sz, reg - adrp \reg, _end - orr \reg, \reg, #(1 << VA_BITS_MIN) - 1 - clz \reg, \reg - .endm - /* * tcr_compute_pa_size - set TCR.(I)PS to the highest supported * ID_AA64MMFR0_EL1.PARange value diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index f1fc98a233d5..bf05a77873a4 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -29,6 +29,7 @@ #define SWAPPER_TABLE_SHIFT (SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3) #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL) +#define INIT_IDMAP_PGTABLE_LEVELS (IDMAP_LEVELS - SWAPPER_SKIP_LEVEL) #define IDMAP_VA_BITS 48 #define IDMAP_LEVELS ARM64_HW_PGTABLE_LEVELS(IDMAP_VA_BITS) @@ -48,44 +49,39 @@ #define EARLY_ENTRIES(vstart, vend, shift, add) \ (SPAN_NR_ENTRIES(vstart, vend, shift) + (add)) -#define EARLY_LEVEL(lvl, vstart, vend, add) \ - (SWAPPER_PGTABLE_LEVELS > lvl ? EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0) +#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \ + (lvls > lvl ? 
EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0) -#define EARLY_PAGES(vstart, vend, add) (1 /* PGDIR page */ \ - + EARLY_LEVEL(3, (vstart), (vend), add) /* each entry needs a next level page table */ \ - + EARLY_LEVEL(2, (vstart), (vend), add) /* each entry needs a next level page table */ \ - + EARLY_LEVEL(1, (vstart), (vend), add))/* each entry needs a next level page table */ -#define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE) + EARLY_SEGMENT_EXTRA_PAGES)) +#define EARLY_PAGES(lvls, vstart, vend, add) (1 /* PGDIR page */ \ + + EARLY_LEVEL(3, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \ + + EARLY_LEVEL(2, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \ + + EARLY_LEVEL(1, (lvls), (vstart), (vend), add))/* each entry needs a next level page table */ +#define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \ + + EARLY_SEGMENT_EXTRA_PAGES)) -/* the initial ID map may need two extra pages if it needs to be extended */ -#if VA_BITS < 48 -#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + 2) * PAGE_SIZE) -#else -#define INIT_IDMAP_DIR_SIZE (INIT_IDMAP_DIR_PAGES * PAGE_SIZE) -#endif -#define INIT_IDMAP_DIR_PAGES EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE, 1) +#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1)) +#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE) + +#define INIT_IDMAP_FDT_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1) +#define INIT_IDMAP_FDT_SIZE ((INIT_IDMAP_FDT_PAGES + EARLY_IDMAP_EXTRA_FDT_PAGES) * PAGE_SIZE) /* The number of segments in the kernel image (text, rodata, inittext, initdata, data+bss) */ #define KERNEL_SEGMENT_COUNT 5 #if SWAPPER_BLOCK_SIZE > SEGMENT_ALIGN #define EARLY_SEGMENT_EXTRA_PAGES (KERNEL_SEGMENT_COUNT + 1) -#else -#define EARLY_SEGMENT_EXTRA_PAGES 0 -#endif - /* - * Initial memory map attributes. + * The initial ID map consists of the kernel image, mapped as two separate + * segments, and may appear misaligned wrt the swapper block size. This means + * we need 3 additional pages. The DT could straddle a swapper block boundary, + * so it may need 2. */ -#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED | PTE_UXN) -#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S | PTE_UXN) - -#ifdef CONFIG_ARM64_4K_PAGES -#define SWAPPER_RW_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS | PTE_WRITE) -#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PMD_SECT_RDONLY) +#define EARLY_IDMAP_EXTRA_PAGES 3 +#define EARLY_IDMAP_EXTRA_FDT_PAGES 2 #else -#define SWAPPER_RW_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS | PTE_WRITE) -#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PTE_RDONLY) +#define EARLY_SEGMENT_EXTRA_PAGES 0 +#define EARLY_IDMAP_EXTRA_PAGES 0 +#define EARLY_IDMAP_EXTRA_FDT_PAGES 0 #endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 926fbbcecbe0..a8a89a0f2867 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -61,11 +61,9 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) } /* - * TCR.T0SZ value to use when the ID map is active. 
Usually equals - * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in - * physical memory, in which case it will be smaller. + * TCR.T0SZ value to use when the ID map is active. */ -extern int idmap_t0sz; +#define idmap_t0sz TCR_T0SZ(IDMAP_VA_BITS) /* * Ensure TCR.T0SZ is set to the provided value. diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a1c29d64e875..545b5d8976f4 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -80,26 +80,42 @@ * x19 primary_entry() .. start_kernel() whether we entered with the MMU on * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 - * x22 create_idmap() .. start_kernel() ID map VA of the DT blob * x25 primary_entry() .. start_kernel() supported VA size - * x28 create_idmap() callee preserved temp register */ SYM_CODE_START(primary_entry) bl record_mmu_state bl preserve_boot_args - bl create_idmap + + adrp x1, early_init_stack + mov sp, x1 + mov x29, xzr + adrp x0, init_idmap_pg_dir + bl __pi_create_init_idmap + + /* + * If the page tables have been populated with non-cacheable + * accesses (MMU disabled), invalidate those tables again to + * remove any speculatively loaded cache lines. + */ + cbnz x19, 0f + dmb sy + mov x1, x0 // end of used region + adrp x0, init_idmap_pg_dir + adr_l x2, dcache_inval_poc + blr x2 + b 1f /* * If we entered with the MMU and caches on, clean the ID mapped part * of the primary boot code to the PoC so we can safely execute it with * the MMU off. */ - cbz x19, 0f - adrp x0, __idmap_text_start +0: adrp x0, __idmap_text_start adr_l x1, __idmap_text_end adr_l x2, dcache_clean_poc blr x2 -0: mov x0, x19 + +1: mov x0, x19 bl init_kernel_el // w0=cpu_boot_mode mov x20, x0 @@ -175,238 +191,6 @@ SYM_CODE_START_LOCAL(preserve_boot_args) ret SYM_CODE_END(preserve_boot_args) -/* - * Macro to populate page table entries, these entries can be pointers to the next level - * or last level entries pointing to physical memory. - * - * tbl: page table address - * rtbl: pointer to page table or physical memory - * index: start index to write - * eindex: end index to write - [index, eindex] written to - * flags: flags for pagetable entry to or in - * inc: increment to rtbl between each entry - * tmp1: temporary variable - * - * Preserves: tbl, eindex, flags, inc - * Corrupts: index, tmp1 - * Returns: rtbl - */ - .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 -.Lpe\@: phys_to_pte \tmp1, \rtbl - orr \tmp1, \tmp1, \flags // tmp1 = table entry - str \tmp1, [\tbl, \index, lsl #3] - add \rtbl, \rtbl, \inc // rtbl = pa next level - add \index, \index, #1 - cmp \index, \eindex - b.ls .Lpe\@ - .endm - -/* - * Compute indices of table entries from virtual address range. If multiple entries - * were needed in the previous page table level then the next page table level is assumed - * to be composed of multiple pages. (This effectively scales the end index). - * - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend] - * shift: shift used to transform virtual address into index - * order: #imm 2log(number of entries in page table) - * istart: index in table corresponding to vstart - * iend: index in table corresponding to vend - * count: On entry: how many extra entries were required in previous level, scales - * our end index. 
- * On exit: returns how many extra entries required for next page table level - * - * Preserves: vstart, vend - * Returns: istart, iend, count - */ - .macro compute_indices, vstart, vend, shift, order, istart, iend, count - ubfx \istart, \vstart, \shift, \order - ubfx \iend, \vend, \shift, \order - add \iend, \iend, \count, lsl \order - sub \count, \iend, \istart - .endm - -/* - * Map memory for specified virtual address range. Each level of page table needed supports - * multiple entries. If a level requires n entries the next page table level is assumed to be - * formed from n pages. - * - * tbl: location of page table - * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) - * vstart: virtual address of start of range - * vend: virtual address of end of range - we map [vstart, vend - 1] - * flags: flags to use to map last level entries - * phys: physical address corresponding to vstart - physical memory is contiguous - * order: #imm 2log(number of entries in PGD table) - * - * If extra_shift is set, an extra level will be populated if the end address does - * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range. - * - * Temporaries: istart, iend, tmp, count, sv - these need to be different registers - * Preserves: vstart, flags - * Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv - */ - .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift - sub \vend, \vend, #1 - add \rtbl, \tbl, #PAGE_SIZE - mov \count, #0 - - .ifnb \extra_shift - tst \vend, #~((1 << (\extra_shift)) - 1) - b.eq .L_\@ - compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - .endif -.L_\@: - compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv - -#if SWAPPER_PGTABLE_LEVELS > 3 - compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - -#if SWAPPER_PGTABLE_LEVELS > 2 - compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - mov \sv, \rtbl - populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp - mov \tbl, \sv -#endif - - compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count - bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1 - populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp - .endm - -/* - * Remap a subregion created with the map_memory macro with modified attributes - * or output address. The entire remapped region must have been covered in the - * invocation of map_memory. 
- * - * x0: last level table address (returned in first argument to map_memory) - * x1: start VA of the existing mapping - * x2: start VA of the region to update - * x3: end VA of the region to update (exclusive) - * x4: start PA associated with the region to update - * x5: attributes to set on the updated region - * x6: order of the last level mappings - */ -SYM_FUNC_START_LOCAL(remap_region) - sub x3, x3, #1 // make end inclusive - - // Get the index offset for the start of the last level table - lsr x1, x1, x6 - bfi x1, xzr, #0, #PAGE_SHIFT - 3 - - // Derive the start and end indexes into the last level table - // associated with the provided region - lsr x2, x2, x6 - lsr x3, x3, x6 - sub x2, x2, x1 - sub x3, x3, x1 - - mov x1, #1 - lsl x6, x1, x6 // block size at this level - - populate_entries x0, x4, x2, x3, x5, x6, x7 - ret -SYM_FUNC_END(remap_region) - -SYM_FUNC_START_LOCAL(create_idmap) - mov x28, lr - /* - * The ID map carries a 1:1 mapping of the physical address range - * covered by the loaded image, which could be anywhere in DRAM. This - * means that the required size of the VA (== PA) space is decided at - * boot time, and could be more than the configured size of the VA - * space for ordinary kernel and user space mappings. - * - * There are three cases to consider here: - * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover - * the placement of the image. In this case, we configure one extra - * level of translation on the fly for the ID map only. (This case - * also covers 42-bit VA/52-bit PA on 64k pages). - * - * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can - * only happen when using 64k pages, in which case we need to extend - * the root level table rather than add a level. Note that we can - * treat this case as 'always extended' as long as we take care not - * to program an unsupported T0SZ value into the TCR register. - * - * - Combinations that would require two additional levels of - * translation are not supported, e.g., VA_BITS==36 on 16k pages, or - * VA_BITS==39/4k pages with 5-level paging, where the input address - * requires more than 47 or 48 bits, respectively. - */ -#if (VA_BITS < 48) -#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT) -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) - - /* - * If VA_BITS < 48, we have to configure an additional table level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif -#else -#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT) -#define EXTRA_SHIFT - /* - * If VA_BITS == 48, we don't have to configure an additional - * translation level, but the top-level table has more entries. 
- */ -#endif - adrp x0, init_idmap_pg_dir - adrp x3, _text - adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - mov_q x7, SWAPPER_RX_MMUFLAGS - - map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT - - /* Remap [.init].data, BSS and the kernel page tables r/w in the ID map */ - adrp x1, _text - adrp x2, __initdata_begin - adrp x3, _end - bic x4, x2, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* Remap the FDT after the kernel image */ - adrp x1, _text - adrp x22, _end + SWAPPER_BLOCK_SIZE - bic x2, x22, #SWAPPER_BLOCK_SIZE - 1 - bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address - add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE - bic x4, x21, #SWAPPER_BLOCK_SIZE - 1 - mov_q x5, SWAPPER_RW_MMUFLAGS - mov x6, #SWAPPER_BLOCK_SHIFT - bl remap_region - - /* - * Since the page tables have been populated with non-cacheable - * accesses (MMU disabled), invalidate those tables again to - * remove any speculatively loaded cache lines. - */ - cbnz x19, 0f // skip cache invalidation if MMU is on - dmb sy - - adrp x0, init_idmap_pg_dir - adrp x1, init_idmap_pg_end - bl dcache_inval_poc -0: ret x28 -SYM_FUNC_END(create_idmap) - /* * Initialize CPU registers with task-specific and cpu-specific context. * @@ -729,11 +513,6 @@ SYM_FUNC_START_LOCAL(__no_granule_support) SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) - mrs x1, tcr_el1 - mov x2, #64 - VA_BITS - tcr_set_t0sz x1, x2 - msr tcr_el1, x1 - adrp x1, reserved_pg_dir adrp x2, init_idmap_pg_dir bl __enable_mmu @@ -742,7 +521,7 @@ SYM_FUNC_START_LOCAL(__primary_switch) mov sp, x1 mov x29, xzr mov x0, x20 // pass the full boot status - mov x1, x22 // pass the low FDT mapping + mov x1, x21 // pass the FDT bl __pi_early_map_kernel // Map and relocate the kernel ldr x8, =__primary_switched diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e566b32f9c22..941a14c05184 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -52,6 +52,7 @@ PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); +PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); PROVIDE(__pi_init_pg_dir = init_pg_dir); PROVIDE(__pi_init_pg_end = init_pg_end); diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 8c2f80a46b93..4393b41f0b71 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -11,6 +11,9 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -fno-asynchronous-unwind-tables -fno-unwind-tables \ $(call cc-option,-fno-addrsig) +# this code may run with the MMU off so disable unaligned accesses +CFLAGS_map_range.o += -mstrict-align + # remove SCS flags from all objects in this directory KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS)) # disable LTO diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index f206373b28b0..f86e878d366d 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -128,6 +128,22 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) } } +static void __init map_fdt(u64 fdt) +{ + static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); + u64 efdt = fdt + MAX_FDT_SIZE; + u64 ptep = (u64)ptes; + + /* + * Map up to MAX_FDT_SIZE bytes, but avoid overlap with + * the kernel 
image. + */ + map_range(&ptep, fdt, (u64)_text > fdt ? min((u64)_text, efdt) : efdt, + fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, + (pte_t *)init_idmap_pg_dir, false, 0); + dsb(ishst); +} + asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) { static char const chosen_str[] __initconst = "/chosen"; @@ -136,6 +152,8 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) int root_level = 4 - CONFIG_PGTABLE_LEVELS; int chosen; + map_fdt((u64)fdt); + /* Clear BSS and the initial page tables */ memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index c31feda18f47..79e4f6a2efe1 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -86,3 +86,15 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, tbl++; } } + +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir) +{ + u64 ptep = (u64)pg_dir + PAGE_SIZE; + + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, + PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, + PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + + return ptep; +} diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index d307c58e9741..1ea282a5f96a 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -21,6 +21,8 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; +extern pgd_t init_idmap_pg_dir[]; + void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); @@ -30,3 +32,5 @@ void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset); asmlinkage void early_map_kernel(u64 boot_status, void *fdt); + +asmlinkage u64 create_init_idmap(pgd_t *pgd); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a991f195592b..14a62c773201 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -45,8 +45,6 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ -int idmap_t0sz __ro_after_init; - #if VA_BITS > 48 u64 vabits_actual __ro_after_init = VA_BITS_MIN; EXPORT_SYMBOL(vabits_actual); @@ -793,8 +791,6 @@ void __init paging_init(void) pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); extern pgd_t init_idmap_pg_dir[]; - idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0)); - map_kernel(pgdp); map_mem(pgdp); @@ -809,7 +805,6 @@ void __init paging_init(void) memblock_allow_resize(); create_idmap(); - idmap_t0sz = TCR_T0SZ(IDMAP_VA_BITS); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 47ede52bb900..55c366dbda8f 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -200,7 +200,8 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS | PTE_WRITE) +#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | PTE_TYPE_PAGE | \ + PTE_AF | PTE_SHARED | PTE_UXN | PTE_WRITE) .pushsection ".idmap.text", "a" From 567a70c181df72f3bb42ca825adb5de682713caa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:08 +0100 Subject: [PATCH 055/134] arm64: mm: avoid fixmap for early swapper_pg_dir updates Early in 
the boot, when .rodata is still writable, we can poke swapper_pg_dir entries directly, and there is no need to go through the fixmap. After a future patch, we will enter the kernel with swapper_pg_dir already active, and early swapper_pg_dir updates for creating the fixmap page table hierarchy itself cannot go through the fixmap for obvious reasons. So let's keep track of whether rodata is writable, and update the descriptor directly in that case. As the same reasoning applies to early KASAN init, make the function noinstr as well. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-67-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 14a62c773201..9758f7e3f4b6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -55,6 +55,8 @@ EXPORT_SYMBOL(kimage_voffset); u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 }; +static bool rodata_is_rw __ro_after_init = true; + /* * The booting CPU updates the failed status @__early_cpu_boot_status, * with MMU turned off. @@ -71,10 +73,21 @@ EXPORT_SYMBOL(empty_zero_page); static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); -void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) +void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) { pgd_t *fixmap_pgdp; + /* + * Don't bother with the fixmap if swapper_pg_dir is still mapped + * writable in the kernel mapping. + */ + if (rodata_is_rw) { + WRITE_ONCE(*pgdp, pgd); + dsb(ishst); + isb(); + return; + } + spin_lock(&swapper_pgdir_lock); fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); WRITE_ONCE(*fixmap_pgdp, pgd); @@ -628,6 +641,7 @@ void mark_rodata_ro(void) * to cover NOTES and EXCEPTION_TABLE. */ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; + WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); From ba5b0333a847ac026725122e085b2fea9e1674bc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:09 +0100 Subject: [PATCH 056/134] arm64: mm: omit redundant remap of kernel image Now that the early kernel mapping is created with all the right attributes and segment boundaries, there is no longer a need to recreate it and switch to it. This also means we no longer have to copy the kasan shadow or some parts of the fixmap from one set of page tables to the other.
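The reason a bare memcpy() of the root level suffices (see the map_kernel() hunk below) is that both hierarchies then share the same lower-level tables. A toy user-space sketch of that aliasing, with made-up names:

#include <stdio.h>
#include <string.h>

#define NR_ENTRIES 4

static int leaf_table[NR_ENTRIES] = { 1, 2, 3, 4 };
static int *init_root[NR_ENTRIES];
static int *final_root[NR_ENTRIES];

int main(void)
{
	init_root[0] = leaf_table;		/* populate one root slot */

	/* copy only the root, as done for swapper_pg_dir */
	memcpy(final_root, init_root, sizeof(init_root));

	leaf_table[2] = 42;			/* update the shared leaf */
	printf("%d\n", final_root[0][2]);	/* prints 42 */
	return 0;
}

Only the root page itself is duplicated; every entry below it keeps pointing at the page tables that were populated via init_pg_dir.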
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-68-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fixmap.h | 1 - arch/arm64/include/asm/kasan.h | 2 - arch/arm64/include/asm/mmu.h | 2 +- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/pi/map_kernel.c | 6 ++- arch/arm64/mm/fixmap.c | 34 ------------- arch/arm64/mm/kasan_init.c | 15 ------ arch/arm64/mm/mmu.c | 85 +++++-------------------------- 8 files changed, 21 insertions(+), 125 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 58c294a96676..8aabd45e9a13 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -100,7 +100,6 @@ enum fixed_addresses { #define FIXMAP_PAGE_IO __pgprot(PROT_DEVICE_nGnRE) void __init early_fixmap_init(void); -void __init fixmap_copy(pgd_t *pgdir); #define __early_set_fixmap __set_fixmap diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 7eefc525a9df..e1b57c13f8a4 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -17,11 +17,9 @@ asmlinkage void kasan_early_init(void); void kasan_init(void); -void kasan_copy_shadow(pgd_t *pgdir); #else static inline void kasan_init(void) { } -static inline void kasan_copy_shadow(pgd_t *pgdir) { } #endif #endif diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index d0b8b4b413b6..65977c7783c5 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -110,7 +110,7 @@ static inline bool kaslr_requires_kpti(void) } #define INIT_MM_CONTEXT(name) \ - .pgd = init_pg_dir, + .pgd = swapper_pg_dir, #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 941a14c05184..e140c5bda90b 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -55,6 +55,7 @@ PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); PROVIDE(__pi_init_pg_dir = init_pg_dir); PROVIDE(__pi_init_pg_end = init_pg_end); +PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); PROVIDE(__pi__text = _text); PROVIDE(__pi__stext = _stext); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index f86e878d366d..4b76a007a50d 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -124,8 +124,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) text_prot, true, root_level); map_segment(init_pg_dir, NULL, va_offset, __inittext_begin, __inittext_end, text_prot, false, root_level); - dsb(ishst); } + + /* Copy the root page table to its final location */ + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PGD_SIZE); + dsb(ishst); + idmap_cpu_replace_ttbr1(swapper_pg_dir); } static void __init map_fdt(u64 fdt) diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index 6fc17b2e1714..9404f282f829 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -170,37 +170,3 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } - -/* - * Copy the fixmap region into a new pgdir. - */ -void __init fixmap_copy(pgd_t *pgdir) -{ - if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdir, FIXADDR_TOT_START)))) { - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't - * live in the carveout for the swapper_pg_dir. We can simply - * re-use the existing dir for the fixmap. 
- */ - set_pgd(pgd_offset_pgd(pgdir, FIXADDR_TOT_START), - READ_ONCE(*pgd_offset_k(FIXADDR_TOT_START))); - } else if (CONFIG_PGTABLE_LEVELS > 3) { - pgd_t *bm_pgdp; - p4d_t *bm_p4dp; - pud_t *bm_pudp; - /* - * The fixmap shares its top level pgd entry with the kernel - * mapping. This can really only occur when we are running - * with 16k/4 levels, so we can simply reuse the pud level - * entry instead. - */ - BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - bm_pgdp = pgd_offset_pgd(pgdir, FIXADDR_TOT_START); - bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_TOT_START); - bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_TOT_START); - pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd)); - pud_clear_fixmap(); - } else { - BUG(); - } -} diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946..89828ad2bca7 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -189,21 +189,6 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end, kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false); } -/* - * Copy the current shadow region into a new pgdir. - */ -void __init kasan_copy_shadow(pgd_t *pgdir) -{ - pgd_t *pgdp, *pgdp_new, *pgdp_end; - - pgdp = pgd_offset_k(KASAN_SHADOW_START); - pgdp_end = pgd_offset_k(KASAN_SHADOW_END); - pgdp_new = pgd_offset_pgd(pgdir, KASAN_SHADOW_START); - do { - set_pgd(pgdp_new, READ_ONCE(*pgdp)); - } while (pgdp++, pgdp_new++, pgdp != pgdp_end); -} - static void __init clear_pgds(unsigned long start, unsigned long end) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9758f7e3f4b6..3db40b517947 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -648,9 +648,9 @@ void mark_rodata_ro(void) debug_checkwx(); } -static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma, - int flags, unsigned long vm_flags) +static void __init declare_vma(struct vm_struct *vma, + void *va_start, void *va_end, + unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -658,9 +658,6 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(pa_start)); BUG_ON(!PAGE_ALIGNED(size)); - __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, flags); - if (!(vm_flags & VM_NO_GUARD)) size += PAGE_SIZE; @@ -673,12 +670,12 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, vm_area_add_early(vma); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static pgprot_t kernel_exec_prot(void) { return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static int __init map_entry_trampoline(void) { int i; @@ -713,60 +710,17 @@ core_initcall(map_entry_trampoline); #endif /* - * Open coded check for BTI, only for use to determine configuration - * for early mappings for before the cpufeature code has run. + * Declare the VMA areas for the kernel */ -static bool arm64_early_this_cpu_has_bti(void) +static void __init declare_kernel_vmas(void) { - u64 pfr1; + static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; - if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) - return false; - - pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1); - return cpuid_feature_extract_unsigned_field(pfr1, - ID_AA64PFR1_EL1_BT_SHIFT); -} - -/* - * Create fine-grained mappings for the kernel. 
- */ -static void __init map_kernel(pgd_t *pgdp) -{ - static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext, - vmlinux_initdata, vmlinux_data; - - /* - * External debuggers may need to write directly to the text - * mapping to install SW breakpoints. Allow this (only) when - * explicitly requested with rodata=off. - */ - pgprot_t text_prot = kernel_exec_prot(); - - /* - * If we have a CPU that supports BTI and a kernel built for - * BTI then mark the kernel executable text as guarded pages - * now so we don't have to rewrite the page tables later. - */ - if (arm64_early_this_cpu_has_bti()) - text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP); - - /* - * Only rodata will be remapped with different permissions later on, - * all other segments are allowed to use contiguous mappings. - */ - map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0, - VM_NO_GUARD); - map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, - &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); - map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot, - &vmlinux_inittext, 0, VM_NO_GUARD); - map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL, - &vmlinux_initdata, 0, VM_NO_GUARD); - map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0); - - fixmap_copy(pgdp); - kasan_copy_shadow(pgdp); + declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); + declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); + declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); + declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); + declare_vma(&vmlinux_seg[4], _data, _end, 0); } void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, @@ -802,23 +756,12 @@ static void __init create_idmap(void) void __init paging_init(void) { - pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir)); - extern pgd_t init_idmap_pg_dir[]; - - map_kernel(pgdp); - map_mem(pgdp); - - pgd_clear_fixmap(); - - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir); - init_mm.pgd = swapper_pg_dir; - - memblock_phys_free(__pa_symbol(init_pg_dir), - __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir)); + map_mem(swapper_pg_dir); memblock_allow_resize(); create_idmap(); + declare_kernel_vmas(); } #ifdef CONFIG_MEMORY_HOTPLUG From e0f92f0d1b512cf11b918c5828e73d5df5b667cc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:10 +0100 Subject: [PATCH 057/134] arm64: Revert "mm: provide idmap pointer to cpu_replace_ttbr1()" This reverts commit 1682c45b920643c, which is no longer needed now that we create the permanent kernel mapping directly during early boot. This is a RINO (revert in name only) given that some of the code has moved around, but the changes are straight-forward. 
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-69-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 17 ++++++----------- arch/arm64/mm/kasan_init.c | 4 ++-- arch/arm64/mm/mmu.c | 4 ++-- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a8a89a0f2867..c768d16b81a4 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -108,18 +108,13 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } -static inline void __cpu_install_idmap(pgd_t *idmap) +static inline void cpu_install_idmap(void) { cpu_set_reserved_ttbr0(); local_flush_tlb_all(); cpu_set_idmap_tcr_t0sz(); - cpu_switch_mm(lm_alias(idmap), &init_mm); -} - -static inline void cpu_install_idmap(void) -{ - __cpu_install_idmap(idmap_pg_dir); + cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); } /* @@ -146,21 +141,21 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz) isb(); } -void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp); +void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp); static inline void cpu_enable_swapper_cnp(void) { - __cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir, true); + __cpu_replace_ttbr1(lm_alias(swapper_pg_dir), true); } -static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap) +static inline void cpu_replace_ttbr1(pgd_t *pgdp) { /* * Only for early TTBR1 replacement before cpucaps are finalized and * before we've decided whether to use CNP. */ WARN_ON(system_capabilities_finalized()); - __cpu_replace_ttbr1(pgdp, idmap, false); + __cpu_replace_ttbr1(pgdp, false); } /* diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 89828ad2bca7..a86ab99587c9 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -225,7 +225,7 @@ static void __init kasan_init_shadow(void) */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); dsb(ishst); - cpu_replace_ttbr1(lm_alias(tmp_pg_dir), idmap_pg_dir); + cpu_replace_ttbr1(lm_alias(tmp_pg_dir)); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -261,7 +261,7 @@ static void __init kasan_init_shadow(void) PAGE_KERNEL_RO)); memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); - cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir); + cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); } static void __init kasan_init_depth(void) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3db40b517947..a3d23da92d87 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1445,7 +1445,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, * avoiding the possibility of conflicting TLB entries being allocated. 
*/ -void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp) +void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) { typedef void (ttbr_replace_func)(phys_addr_t); extern ttbr_replace_func idmap_cpu_replace_ttbr1; @@ -1460,7 +1460,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp) replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); - __cpu_install_idmap(idmap); + cpu_install_idmap(); /* * We really don't want to take *any* exceptions while TTBR1 is From 9cce9c6c2c3b7d46698d9bb693389d37740fec28 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:11 +0100 Subject: [PATCH 058/134] arm64: mm: Handle LVA support as a CPU feature Currently, we detect CPU support for 52-bit virtual addressing (LVA) extremely early, before creating the kernel page tables or enabling the MMU. We cannot override the feature this early, and so large virtual addressing is always enabled on CPUs that implement support for it if the software support for it was enabled at build time. It also means we rely on non-trivial code in asm to deal with this feature. Given that both the ID map and the TTBR1 mapping of the kernel image are guaranteed to be 48-bit addressable, it is not actually necessary to enable support this early, and instead, we can model it as a CPU feature. That way, we can rely on code patching to get the correct TCR.T1SZ values programmed on secondary boot and resume from suspend. On the primary boot path, we simply enable the MMU with 48-bit virtual addressing initially, and update TCR.T1SZ if LVA is supported from C code, right before creating the kernel mapping. Given that TTBR1 still points to reserved_pg_dir at this point, updating TCR.T1SZ should be safe without the need for explicit TLB maintenance. Since this gets rid of all accesses to the vabits_actual variable from asm code that occurred before TCR.T1SZ had been programmed, we no longer have a need for this variable, and we can replace it with a C expression that produces the correct value directly, based on the value of TCR.T1SZ. 
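A minimal sketch of the new vabits_actual derivation (user-space, with fabricated register values; T1SZ occupies bits [21:16] of TCR_EL1):

#include <stdio.h>
#include <stdint.h>

/* 64 - TCR_EL1.T1SZ gives the VA width programmed for TTBR1 */
static unsigned int vabits_from_tcr(uint64_t tcr)
{
	return 64 - ((tcr >> 16) & 0x3f);
}

int main(void)
{
	uint64_t tcr_48 = (uint64_t)16 << 16;	/* T1SZ == 16 */
	uint64_t tcr_52 = (uint64_t)12 << 16;	/* T1SZ == 12 */

	printf("%u %u\n", vabits_from_tcr(tcr_48), vabits_from_tcr(tcr_52));
	return 0;
}

This only reads back what __cpu_setup (or the early C code) programmed into TCR_EL1, so no separate variable needs to be kept in sync with it.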
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-70-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 9 +++++++++ arch/arm64/include/asm/memory.h | 13 ++++++++++++- arch/arm64/kernel/cpufeature.c | 13 +++++++++++++ arch/arm64/kernel/head.S | 29 +++++++---------------------- arch/arm64/kernel/image-vars.h | 1 - arch/arm64/kernel/pi/map_kernel.c | 3 +++ arch/arm64/kernel/sleep.S | 3 --- arch/arm64/mm/mmu.c | 5 ----- arch/arm64/mm/proc.S | 9 ++++----- arch/arm64/tools/cpucaps | 1 + 10 files changed, 49 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e3edae1825f3..4f4dc5496ee3 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -995,6 +995,15 @@ static inline bool cpu_has_pac(void) &id_aa64isar2_override); } +static inline bool cpu_has_lva(void) +{ + u64 mmfr2; + + mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); + return cpuid_feature_extract_unsigned_field(mmfr2, + ID_AA64MMFR2_EL1_VARange_SHIFT); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 60904a6c4b42..9680d7444b3b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -209,9 +209,20 @@ #include #include #include +#include + +static inline u64 __pure read_tcr(void) +{ + u64 tcr; + + // read_sysreg() uses asm volatile, so avoid it here + asm("mrs %0, tcr_el1" : "=r"(tcr)); + return tcr; +} #if VA_BITS > 48 -extern u64 vabits_actual; +// For reasons of #include hell, we can't use TCR_T1SZ_OFFSET/TCR_T1SZ_MASK here +#define vabits_actual (64 - ((read_tcr() >> 16) & 63)) #else #define vabits_actual ((u64)VA_BITS) #endif diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7064cf13f226..8eb8c7f7b317 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2692,6 +2692,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_lpa2, }, +#ifdef CONFIG_ARM64_VA_BITS_52 + { + .desc = "52-bit Virtual Addressing (LVA)", + .capability = ARM64_HAS_VA52, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_width = 4, + .field_pos = ID_AA64MMFR2_EL1_VARange_SHIFT, + .matches = has_cpuid_feature, + .min_field_value = ID_AA64MMFR2_EL1_VARange_52, + }, +#endif {}, }; diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 545b5d8976f4..e25351addfd0 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -80,7 +80,6 @@ * x19 primary_entry() .. start_kernel() whether we entered with the MMU on * x20 primary_entry() .. __primary_switch() CPU boot mode * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0 - * x25 primary_entry() .. start_kernel() supported VA size */ SYM_CODE_START(primary_entry) bl record_mmu_state @@ -125,14 +124,6 @@ SYM_CODE_START(primary_entry) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. 
*/ -#if VA_BITS > 48 - mrs_s x0, SYS_ID_AA64MMFR2_EL1 - tst x0, ID_AA64MMFR2_EL1_VARange_MASK - mov x0, #VA_BITS - mov x25, #VA_BITS_MIN - csel x25, x25, x0, eq - mov x0, x25 -#endif bl __cpu_setup // initialise processor b __primary_switch SYM_CODE_END(primary_entry) @@ -242,11 +233,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) mov x0, x20 bl set_cpu_boot_mode_flag -#if VA_BITS > 48 - adr_l x8, vabits_actual // Set this early so KASAN early init - str x25, [x8] // ... observes the correct value - dc civac, x8 // Make visible to booting secondaries -#endif #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif @@ -376,10 +362,13 @@ SYM_FUNC_START_LOCAL(secondary_startup) * Common entry point for secondary CPUs. */ mov x20, x0 // preserve boot mode + +#ifdef CONFIG_ARM64_VA_BITS_52 +alternative_if ARM64_HAS_VA52 bl __cpu_secondary_check52bitva -#if VA_BITS > 48 - ldr_l x0, vabits_actual +alternative_else_nop_endif #endif + bl __cpu_setup // initialise processor adrp x1, swapper_pg_dir adrp x2, idmap_pg_dir @@ -482,12 +471,8 @@ SYM_FUNC_START(__enable_mmu) ret SYM_FUNC_END(__enable_mmu) +#ifdef CONFIG_ARM64_VA_BITS_52 SYM_FUNC_START(__cpu_secondary_check52bitva) -#if VA_BITS > 48 - ldr_l x0, vabits_actual - cmp x0, #52 - b.ne 2f - mrs_s x0, SYS_ID_AA64MMFR2_EL1 and x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f @@ -498,9 +483,9 @@ SYM_FUNC_START(__cpu_secondary_check52bitva) wfi b 1b -#endif 2: ret SYM_FUNC_END(__cpu_secondary_check52bitva) +#endif SYM_FUNC_START_LOCAL(__no_granule_support) /* Indicate that this CPU can't boot and is stuck in the kernel */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e140c5bda90b..2b9d702abe0f 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -36,7 +36,6 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_vabits_actual = vabits_actual); PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 4b76a007a50d..1853825aa29d 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -165,6 +165,9 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) chosen = fdt_path_offset(fdt, chosen_str); init_feature_override(boot_status, fdt, chosen); + if (VA_BITS > VA_BITS_MIN && cpu_has_lva()) + sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(VA_BITS)); + /* * The virtual KASLR displacement modulo 2MiB is decided by the * physical placement of the image, as otherwise, we might not be able diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 2aa5129d8253..f093cdf71be1 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -102,9 +102,6 @@ SYM_CODE_START(cpu_resume) mov x0, xzr bl init_kernel_el mov x19, x0 // preserve boot mode -#if VA_BITS > 48 - ldr_l x0, vabits_actual -#endif bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index a3d23da92d87..ba00d0205447 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -45,11 +45,6 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ -#if VA_BITS > 48 -u64 vabits_actual 
__ro_after_init = VA_BITS_MIN; -EXPORT_SYMBOL(vabits_actual); -#endif - u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 55c366dbda8f..d104ddab26a4 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -397,8 +397,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings) * * Initialise the processor for turning the MMU on. * - * Input: - * x0 - actual number of VA bits (ignored unless VA_BITS > 48) * Output: * Return in x0 the value of the SCTLR_EL1 register. */ @@ -422,16 +420,17 @@ SYM_FUNC_START(__cpu_setup) mair .req x17 tcr .req x16 mov_q mair, MAIR_EL1_SET - mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS) | TCR_CACHE_FLAGS | \ + mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS tcr_clear_errata_bits tcr, x9, x5 #ifdef CONFIG_ARM64_VA_BITS_52 - sub x9, xzr, x0 - add x9, x9, #64 + mov x9, #64 - VA_BITS +alternative_if ARM64_HAS_VA52 tcr_set_t1sz tcr, x9 +alternative_else_nop_endif #endif /* diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b912b1409fc0..b370d808b3ec 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -50,6 +50,7 @@ HAS_STAGE2_FWB HAS_TCR2 HAS_TIDCP1 HAS_TLB_RANGE +HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT HW_DBM From 68aec33f8f5a87b0450159e5e141d2d6c9d76850 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:12 +0100 Subject: [PATCH 059/134] arm64: mm: Add feature override support for LVA Add support for overriding the VARange field of the MMFR2 CPU ID register. This permits the associated LVA feature to be overridden early enough for the boot code that creates the kernel mapping to take it into account. Given that LPA2 implies LVA, disabling the latter should disable the former as well. So override the ID_AA64MMFR0.TGran field of the current page size as well if it advertises support for 52-bit addressing. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-71-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 17 +++++++++------ arch/arm64/include/asm/cpufeature.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 8 +++++-- arch/arm64/kernel/image-vars.h | 2 ++ arch/arm64/kernel/pi/idreg-override.c | 31 +++++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 6a467c694039..68a99b116256 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -576,18 +576,21 @@ alternative_endif .endm /* - * Offset ttbr1 to allow for 48-bit kernel VAs set with 52-bit PTRS_PER_PGD. + * If the kernel is built for 52-bit virtual addressing but the hardware only + * supports 48 bits, we cannot program the pgdir address into TTBR1 directly, + * but we have to add an offset so that the TTBR1 address corresponds with the + * pgdir entry that covers the lowest 48-bit addressable VA. + * * orr is used as it can cover the immediate value (and is idempotent). - * In future this may be nop'ed out when dealing with 52-bit kernel VAs. * ttbr: Value of ttbr to set, modified. 
*/ .macro offset_ttbr1, ttbr, tmp #ifdef CONFIG_ARM64_VA_BITS_52 - mrs_s \tmp, SYS_ID_AA64MMFR2_EL1 - and \tmp, \tmp, #(0xf << ID_AA64MMFR2_EL1_VARange_SHIFT) - cbnz \tmp, .Lskipoffs_\@ - orr \ttbr, \ttbr, #TTBR1_BADDR_4852_OFFSET -.Lskipoffs_\@ : + mrs \tmp, tcr_el1 + and \tmp, \tmp, #TCR_T1SZ_MASK + cmp \tmp, #TCR_T1SZ(VA_BITS_MIN) + orr \tmp, \ttbr, #TTBR1_BADDR_4852_OFFSET + csel \ttbr, \tmp, \ttbr, eq #endif .endm diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 4f4dc5496ee3..a2ac31aecdd9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -906,7 +906,9 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); +extern struct arm64_ftr_override id_aa64mmfr0_override; extern struct arm64_ftr_override id_aa64mmfr1_override; +extern struct arm64_ftr_override id_aa64mmfr2_override; extern struct arm64_ftr_override id_aa64pfr0_override; extern struct arm64_ftr_override id_aa64pfr1_override; extern struct arm64_ftr_override id_aa64zfr0_override; @@ -1000,6 +1002,8 @@ static inline bool cpu_has_lva(void) u64 mmfr2; mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1); + mmfr2 &= ~id_aa64mmfr2_override.mask; + mmfr2 |= id_aa64mmfr2_override.val; return cpuid_feature_extract_unsigned_field(mmfr2, ID_AA64MMFR2_EL1_VARange_SHIFT); } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8eb8c7f7b317..ed9670d8360c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -655,7 +655,9 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) +struct arm64_ftr_override id_aa64mmfr0_override; struct arm64_ftr_override id_aa64mmfr1_override; +struct arm64_ftr_override id_aa64mmfr2_override; struct arm64_ftr_override id_aa64pfr0_override; struct arm64_ftr_override id_aa64pfr1_override; struct arm64_ftr_override id_aa64zfr0_override; @@ -719,10 +721,12 @@ static const struct __ftr_reg_entry { &id_aa64isar2_override), /* Op1 = 0, CRn = 0, CRm = 7 */ - ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0, + &id_aa64mmfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1, &id_aa64mmfr1_override), - ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), + ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2, + &id_aa64mmfr2_override), ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3), /* Op1 = 1, CRn = 0, CRm = 0 */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 2b9d702abe0f..ff81f809a240 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -38,7 +38,9 @@ PROVIDE(__pi___memset = __pi_memset); PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); +PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); +PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 1884bd936c0d..aad399796e81 100644 --- 
a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -59,6 +59,35 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = { }, }; + +static bool __init mmfr2_varange_filter(u64 val) +{ + int __maybe_unused feat; + + if (val) + return false; + +#ifdef CONFIG_ARM64_LPA2 + feat = cpuid_feature_extract_signed_field(read_sysreg(id_aa64mmfr0_el1), + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + if (feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2) { + id_aa64mmfr0_override.val |= + (ID_AA64MMFR0_EL1_TGRAN_LPA2 - 1) << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + id_aa64mmfr0_override.mask |= 0xfU << ID_AA64MMFR0_EL1_TGRAN_SHIFT; + } +#endif + return true; +} + +static const struct ftr_set_desc mmfr2 __prel64_initconst = { + .name = "id_aa64mmfr2", + .override = &id_aa64mmfr2_override, + .fields = { + FIELD("varange", ID_AA64MMFR2_EL1_VARange_SHIFT, mmfr2_varange_filter), + {} + }, +}; + static bool __init pfr0_sve_filter(u64 val) { /* @@ -167,6 +196,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { { &mmfr1 }, + { &mmfr2 }, { &pfr0 }, { &pfr1 }, { &isar1 }, @@ -192,6 +222,7 @@ static const struct { { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, { "rodata=off", "arm64_sw.rodataoff=1" }, + { "arm64.nolva", "id_aa64mmfr2.varange=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v) From 60d043c101769b4fd6f609b4a7b9b8ad1f867860 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:13 +0100 Subject: [PATCH 060/134] arm64: Avoid #define'ing PTE_MAYBE_NG to 0x0 for asm use The PROT_* macros resolve to expressions that are only valid in C and not in assembler, and so they are only usable from C code. Currently, we make an exception for the permission indirection init code in proc.S, which doesn't care about the bits that are conditionally set, and so we just #define PTE_MAYBE_NG to 0x0 for any assembler file that includes these definitions. This is dodgy because it means that PROT_NORMAL and friends are generally available in asm code, but defined in a way that deviates from the definition that C code will observe, which might lead to hard-to-diagnose issues down the road. So instead, #define PTE_MAYBE_NG only in the place where the PIE constants are evaluated, and #undef it again right after. This allows us to drop the #define from pgtable-prot.h, and avoid the risk of deviating definitions between asm and C.
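The hazard can be seen in miniature in this contrived user-space sketch (not part of the patch; the macro values are stand-ins for the kernel ones, and arm64_use_ng_mappings is modeled as a plain variable): the same protection expression yields different values depending on whether PTE_MAYBE_NG was blanket-defined to zero, which is exactly the asm/C divergence being removed here.

	#include <stdio.h>

	#define PTE_TYPE_PAGE	(3UL << 0)
	#define PTE_AF		(1UL << 10)
	#define PTE_NG		(1UL << 11)

	static int arm64_use_ng_mappings = 1;	/* e.g. KPTI enabled */

	int main(void)
	{
		/* what C code computes for a PROT_*-style constant ... */
		unsigned long c_view   = PTE_TYPE_PAGE | PTE_AF |
					 (arm64_use_ng_mappings ? PTE_NG : 0);
		/* ... versus what asm saw with PTE_MAYBE_NG #defined to 0 */
		unsigned long asm_view = PTE_TYPE_PAGE | PTE_AF | 0;

		printf("C view: %#lx, asm view: %#lx\n", c_view, asm_view);
		return 0;
	}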
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-72-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 4 ---- arch/arm64/mm/proc.S | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 483dbfa39c4c..63ced9ccec21 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -57,10 +57,6 @@ #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) -#ifdef __ASSEMBLY__ -#define PTE_MAYBE_NG 0 -#endif - #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index d104ddab26a4..6e1b2bc41a9f 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -456,11 +456,24 @@ alternative_else_nop_endif ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection + /* + * The PROT_* macros describing the various memory types may resolve to + * C expressions if they include the PTE_MAYBE_* macros, and so they + * can only be used from C code. The PIE_E* constants below are also + * defined in terms of those macros, but will mask out those + * PTE_MAYBE_* constants, whether they are set or not. So #define them + * as 0x0 here so we can evaluate the PIE_E* constants in asm context. + */ + +#define PTE_MAYBE_NG 0 + mov_q x0, PIE_E0 msr REG_PIRE0_EL1, x0 mov_q x0, PIE_E1 msr REG_PIR_EL1, x0 +#undef PTE_MAYBE_NG + mov x0, TCR2_EL1x_PIE msr REG_TCR2_EL1, x0 From 7ac8d5b2423cc0112ac2519276610865142a577b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:14 +0100 Subject: [PATCH 061/134] arm64: Add ESR decoding for exceptions involving translation level -1 The LPA2 feature introduces new FSC values to report abort exceptions related to translation level -1. Define these and wire them up. Reuse the new ESR FSC classification helpers that arrived via the KVM arm64 tree, and update the one for translation faults to check specifically for a translation fault at level -1. (Access flag or permission faults cannot occur at level -1 because they always involve a descriptor at the superior level, so changing those helpers is not needed).
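The encoding detail at play can be checked in isolation (a standalone sketch, not kernel code, reusing the constants added below): translation faults at levels 0 through 3 all match the 0x04 FSC type, while the level -1 code 0b101011 (0x2b) does not, which is why the helper needs an explicit extra test. The parameterised TTW macros, on the other hand, absorb level -1 naturally.

	#include <assert.h>

	#define ESR_ELx_FSC_TYPE	(0x3c)
	#define ESR_ELx_FSC_FAULT	(0x04)
	#define ESR_ELx_FSC_SEA_TTW(n)	(0x14 + (n))

	int main(void)
	{
		/* levels 0..3 share the 0x04 type bits ... */
		for (int level = 0; level <= 3; level++)
			assert(((0x04 + level) & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT);
		/* ... but the level -1 translation fault (0b101011) does not */
		assert((0x2b & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT);
		/* SEA during a level -1 table walk: 0x14 - 1 == 0x13, i.e. old "unknown 19" */
		assert(ESR_ELx_FSC_SEA_TTW(-1) == 0x13);
		return 0;
	}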
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-73-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 13 +++++------- arch/arm64/include/asm/kvm_emulate.h | 10 ++-------- arch/arm64/mm/fault.c | 30 ++++++++++------------------ 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 353fe08546cf..81606bf7d5ac 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -117,15 +117,9 @@ #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) -#define ESR_ELx_FSC_SEA_TTW0 (0x14) -#define ESR_ELx_FSC_SEA_TTW1 (0x15) -#define ESR_ELx_FSC_SEA_TTW2 (0x16) -#define ESR_ELx_FSC_SEA_TTW3 (0x17) +#define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n)) #define ESR_ELx_FSC_SECC (0x18) -#define ESR_ELx_FSC_SECC_TTW0 (0x1c) -#define ESR_ELx_FSC_SECC_TTW1 (0x1d) -#define ESR_ELx_FSC_SECC_TTW2 (0x1e) -#define ESR_ELx_FSC_SECC_TTW3 (0x1f) +#define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) @@ -394,6 +388,9 @@ static inline bool esr_is_data_abort(unsigned long esr) static inline bool esr_fsc_is_translation_fault(unsigned long esr) { + /* Translation fault, level -1 */ + if ((esr & ESR_ELx_FSC) == 0b101011) + return true; return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; } diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b804fe832184..6f5b41c70103 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -425,15 +425,9 @@ static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { case ESR_ELx_FSC_EXTABT: - case ESR_ELx_FSC_SEA_TTW0: - case ESR_ELx_FSC_SEA_TTW1: - case ESR_ELx_FSC_SEA_TTW2: - case ESR_ELx_FSC_SEA_TTW3: + case ESR_ELx_FSC_SEA_TTW(-1) ... ESR_ELx_FSC_SEA_TTW(3): case ESR_ELx_FSC_SECC: - case ESR_ELx_FSC_SECC_TTW0: - case ESR_ELx_FSC_SECC_TTW1: - case ESR_ELx_FSC_SECC_TTW2: - case ESR_ELx_FSC_SECC_TTW3: + case ESR_ELx_FSC_SECC_TTW(-1) ... ESR_ELx_FSC_SECC_TTW(3): return true; default: return false; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a8284..60265ede48fe 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -257,16 +257,14 @@ static bool is_el1_data_abort(unsigned long esr) static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { - unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE; - if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; - if (fsc_type == ESR_ELx_FSC_PERM) + if (esr_fsc_is_permission_fault(esr)) return true; if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && + return esr_fsc_is_translation_fault(esr) && (regs->pstate & PSR_PAN_BIT); return false; @@ -279,8 +277,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long flags; u64 par, dfsc; - if (!is_el1_data_abort(esr) || - (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) + if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr)) return false; local_irq_save(flags); @@ -301,7 +298,7 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, * treat the translation fault as spurious. 
*/ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par); - return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT; + return !esr_fsc_is_translation_fault(dfsc); } static void die_kernel_fault(const char *msg, unsigned long addr, @@ -368,11 +365,6 @@ static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) return false; } -static bool is_translation_fault(unsigned long esr) -{ - return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; -} - static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { @@ -405,7 +397,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && + if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; @@ -782,18 +774,18 @@ static const struct fault_info fault_info[] = { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 8" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 12" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, @@ -801,7 +793,7 @@ static const struct fault_info fault_info[] = { { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 27" }, + { do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented @@ -815,9 +807,9 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, + { do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, + { 
do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 44" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 45" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 46" }, From db95ea787bd19be666ba41733259ffea65963bff Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:15 +0100 Subject: [PATCH 062/134] arm64: mm: Wire up TCR.DS bit to PTE shareability fields When LPA2 is enabled, bits 8 and 9 of page and block descriptors become part of the output address instead of carrying shareability attributes for the region in question. So avoid setting these bits if TCR.DS == 1, which means LPA2 is enabled. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-74-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 4 ++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/include/asm/pgtable-prot.h | 16 ++++++++++++++-- arch/arm64/mm/mmap.c | 4 ++++ arch/arm64/mm/proc.S | 2 ++ 5 files changed, 25 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d435139..8c2c36fffcf5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1377,6 +1377,10 @@ config ARM64_PA_BITS default 48 if ARM64_PA_BITS_48 default 52 if ARM64_PA_BITS_52 +config ARM64_LPA2 + def_bool y + depends on ARM64_PA_BITS_52 && !ARM64_64K_PAGES + choice prompt "Endianness" default CPU_LITTLE_ENDIAN diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index e4944d517c99..b770f98fc0b5 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -284,6 +284,7 @@ #define TCR_E0PD1 (UL(1) << 56) #define TCR_TCMA0 (UL(1) << 57) #define TCR_TCMA1 (UL(1) << 58) +#define TCR_DS (UL(1) << 59) /* * TTBR. diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 63ced9ccec21..dd9ee67d1d87 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -30,8 +30,8 @@ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) -#define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) -#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) +#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) +#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -67,7 +67,19 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +#ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false +#define PTE_MAYBE_SHARED PTE_SHARED +#define PMD_MAYBE_SHARED PMD_SECT_S +#else +static inline bool __pure lpa2_is_enabled(void) +{ + return read_tcr() & TCR_DS; +} + +#define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) +#define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 
0 : PMD_SECT_S) +#endif /* * If we have userspace only BTI we don't want to mark kernel pages diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 645fe60d000f..642bdf908b22 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -73,6 +73,10 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } + if (lpa2_is_enabled()) + for (int i = 0; i < ARRAY_SIZE(protection_map); i++) + pgprot_val(protection_map[i]) &= ~PTE_SHARED; + return 0; } arch_initcall(adjust_protection_map); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 6e1b2bc41a9f..7c46f8cfd6ae 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -466,6 +466,7 @@ alternative_else_nop_endif */ #define PTE_MAYBE_NG 0 +#define PTE_MAYBE_SHARED 0 mov_q x0, PIE_E0 msr REG_PIRE0_EL1, x0 @@ -473,6 +474,7 @@ alternative_else_nop_endif msr REG_PIR_EL1, x0 #undef PTE_MAYBE_NG +#undef PTE_MAYBE_SHARED mov x0, TCR2_EL1x_PIE msr REG_TCR2_EL1, x0 From 925a0eb48044bf3d48531703c3b7522e1a8c87fb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:16 +0100 Subject: [PATCH 063/134] arm64: mm: Add LPA2 support to phys<->pte conversion routines In preparation for enabling LPA2 support, introduce the mask values for converting between physical addresses and their representations in a page table descriptor. While at it, move the pte_to_phys asm macro into its only user, so that we can freely modify it to use its input value register as a temp register. For LPA2, the PTE_ADDR_MASK contains two non-adjacent sequences of zero bits, which means it no longer fits into the immediate field of an ordinary ALU instruction. So let's redefine it to include the bits in between as well, and only use it when converting from physical address to PTE representation, where the distinction does not matter. Also update the name accordingly to emphasize this. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-75-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 16 ++-------------- arch/arm64/include/asm/pgtable-hwdef.h | 10 +++++++--- arch/arm64/include/asm/pgtable.h | 5 +++-- arch/arm64/mm/proc.S | 8 ++++++++ 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 68a99b116256..7eedcb36ebe0 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -612,25 +612,13 @@ alternative_endif .macro phys_to_pte, pte, phys #ifdef CONFIG_ARM64_PA_BITS_52 - /* - * We assume \phys is 64K aligned and this is guaranteed by only - * supporting this configuration with 64K pages. - */ - orr \pte, \phys, \phys, lsr #36 - and \pte, \pte, #PTE_ADDR_MASK + orr \pte, \phys, \phys, lsr #PTE_ADDR_HIGH_SHIFT + and \pte, \pte, #PHYS_TO_PTE_ADDR_MASK #else mov \pte, \phys #endif .endm - .macro pte_to_phys, phys, pte - and \phys, \pte, #PTE_ADDR_MASK -#ifdef CONFIG_ARM64_PA_BITS_52 - orr \phys, \phys, \phys, lsl #PTE_ADDR_HIGH_SHIFT - and \phys, \phys, GENMASK_ULL(PHYS_MASK_SHIFT - 1, PAGE_SHIFT) -#endif - .endm - /* * tcr_clear_errata_bits - Clear TCR bits that trigger an errata on this CPU. 
*/ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index b770f98fc0b5..4426f48f2ae0 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -155,13 +155,17 @@ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ -#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) +#define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (50 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 +#ifdef CONFIG_ARM64_64K_PAGES #define PTE_ADDR_HIGH (_AT(pteval_t, 0xf) << 12) -#define PTE_ADDR_MASK (PTE_ADDR_LOW | PTE_ADDR_HIGH) #define PTE_ADDR_HIGH_SHIFT 36 +#define PHYS_TO_PTE_ADDR_MASK (PTE_ADDR_LOW | PTE_ADDR_HIGH) #else -#define PTE_ADDR_MASK PTE_ADDR_LOW +#define PTE_ADDR_HIGH (_AT(pteval_t, 0x3) << 8) +#define PTE_ADDR_HIGH_SHIFT 42 +#define PHYS_TO_PTE_ADDR_MASK GENMASK_ULL(49, 8) +#endif #endif /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 522c21348ae8..61de7b1516bc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -80,15 +80,16 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_ARM64_PA_BITS_52 static inline phys_addr_t __pte_to_phys(pte_t pte) { + pte_val(pte) &= ~PTE_MAYBE_SHARED; return (pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT); } static inline pteval_t __phys_to_pte_val(phys_addr_t phys) { - return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PTE_ADDR_MASK; + return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK; } #else -#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK) +#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_LOW) #define __phys_to_pte_val(phys) (phys) #endif diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 7c46f8cfd6ae..d03434b7bca5 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -205,6 +205,14 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) .pushsection ".idmap.text", "a" + .macro pte_to_phys, phys, pte + and \phys, \pte, #PTE_ADDR_LOW +#ifdef CONFIG_ARM64_PA_BITS_52 + and \pte, \pte, #PTE_ADDR_HIGH + orr \phys, \phys, \pte, lsl #PTE_ADDR_HIGH_SHIFT +#endif + .endm + .macro kpti_mk_tbl_ng, type, num_entries add end_\type\()p, cur_\type\()p, #\num_entries * 8 .Ldo_\type: From a6bbf5d4d9d13509fd068de664238c16934962c6 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:17 +0100 Subject: [PATCH 064/134] arm64: mm: Add definitions to support 5 levels of paging Add the required types and descriptor accessors to support 5 levels of paging in the common code. This is one of the prerequisites for supporting 52-bit virtual addressing with 4k pages. Note that this does not cover the code that handles kernel mappings or the fixmap. 
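To make the new shift definitions concrete, here is a worked check of the ARM64_HW_PGTABLE_LEVEL_SHIFT() formula used below (a standalone sketch assuming 4k pages, i.e. PAGE_SHIFT = 12; other granules scale the same way, resolving PAGE_SHIFT - 3 bits per level):

	#include <assert.h>

	#define PAGE_SHIFT			12
	#define HW_PGTABLE_LEVEL_SHIFT(n)	((PAGE_SHIFT - 3) * (4 - (n)) + 3)

	int main(void)
	{
		assert(HW_PGTABLE_LEVEL_SHIFT(3) == 12);	/* PTE maps 4k   */
		assert(HW_PGTABLE_LEVEL_SHIFT(2) == 21);	/* PMD maps 2M   */
		assert(HW_PGTABLE_LEVEL_SHIFT(1) == 30);	/* PUD maps 1G   */
		assert(HW_PGTABLE_LEVEL_SHIFT(0) == 39);	/* P4D maps 512G */
		/*
		 * With 5 levels, PGDIR_SHIFT = HW_PGTABLE_LEVEL_SHIFT(4 - 5) = 48,
		 * so a 52-bit VA space needs 2^(52 - 48) = 16 PGD entries.
		 */
		assert(HW_PGTABLE_LEVEL_SHIFT(-1) == 48);
		return 0;
	}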
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-76-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgalloc.h | 41 +++++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 22 ++++++- arch/arm64/include/asm/pgtable-types.h | 6 ++ arch/arm64/include/asm/pgtable.h | 82 +++++++++++++++++++++++++- arch/arm64/mm/mmu.c | 31 +++++++++- arch/arm64/mm/pgd.c | 15 ++++- 6 files changed, 188 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 237224484d0f..cae8c648f462 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -60,6 +60,47 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) +{ + if (pgtable_l5_enabled()) + set_pgd(pgdp, __pgd(__phys_to_pgd_val(p4dp) | prot)); +} + +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) +{ + pgdval_t pgdval = PGD_TYPE_TABLE; + + pgdval |= (mm == &init_mm) ? PGD_TABLE_UXN : PGD_TABLE_PXN; + __pgd_populate(pgdp, __pa(p4dp), pgdval); +} + +static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + return (p4d_t *)get_zeroed_page(gfp); +} + +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!pgtable_l5_enabled()) + return; + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + free_page((unsigned long)p4d); +} + +#define __p4d_free_tlb(tlb, p4d, addr) p4d_free((tlb)->mm, p4d) +#else +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) +{ + BUILD_BUG(); +} +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 4426f48f2ae0..ef207a0d4f0d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -26,10 +26,10 @@ #define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3)) /* - * Size mapped by an entry at level n ( 0 <= n <= 3) + * Size mapped by an entry at level n ( -1 <= n <= 3) * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits * in the final page. The maximum number of translation levels supported by - * the architecture is 4. Hence, starting at level n, we have further + * the architecture is 5. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. * So, the total number of bits mapped by an entry at level n is : * @@ -62,9 +62,16 @@ #define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3)) #endif +#if CONFIG_PGTABLE_LEVELS > 4 +#define P4D_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(0) +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE-1)) +#define PTRS_PER_P4D (1 << (PAGE_SHIFT - 3)) +#endif + /* * PGDIR_SHIFT determines the size a top-level page table entry can map - * (depending on the configuration, this level can be 0, 1 or 2). + * (depending on the configuration, this level can be -1, 0, 1 or 2). */ #define PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - CONFIG_PGTABLE_LEVELS) #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) @@ -87,6 +94,15 @@ /* * Hardware page table definitions. * + * Level -1 descriptor (PGD). 
+ */ +#define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) +#define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) +#define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) +#define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) +#define PGD_TABLE_UXN (_AT(pgdval_t, 1) << 60) + +/* * Level 0 descriptor (P4D). */ #define P4D_TYPE_TABLE (_AT(p4dval_t, 3) << 0) diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h index b8f158ae2527..6d6d4065b0cb 100644 --- a/arch/arm64/include/asm/pgtable-types.h +++ b/arch/arm64/include/asm/pgtable-types.h @@ -36,6 +36,12 @@ typedef struct { pudval_t pud; } pud_t; #define __pud(x) ((pud_t) { (x) } ) #endif +#if CONFIG_PGTABLE_LEVELS > 4 +typedef struct { p4dval_t p4d; } p4d_t; +#define p4d_val(x) ((x).p4d) +#define __p4d(x) ((p4d_t) { (x) } ) +#endif + typedef struct { pgdval_t pgd; } pgd_t; #define pgd_val(x) ((x).pgd) #define __pgd(x) ((pgd_t) { (x) } ) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 61de7b1516bc..7eb2b933ed3c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -808,7 +808,6 @@ static inline pud_t *p4d_pgtable(p4d_t p4d) #else #define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) -#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in */ #define pud_set_fixmap(addr) NULL @@ -819,6 +818,87 @@ static inline pud_t *p4d_pgtable(p4d_t p4d) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static __always_inline bool pgtable_l5_enabled(void) +{ + if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT)) + return vabits_actual == VA_BITS; + return alternative_has_cap_unlikely(ARM64_HAS_VA52); +} + +static inline bool mm_p4d_folded(const struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} +#define mm_p4d_folded mm_p4d_folded + +#define p4d_ERROR(e) \ + pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e)) + +#define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd)) +#define pgd_bad(pgd) (pgtable_l5_enabled() && !(pgd_val(pgd) & 2)) +#define pgd_present(pgd) (!pgd_none(pgd)) + +static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (in_swapper_pgdir(pgdp)) { + set_swapper_pgd(pgdp, __pgd(pgd_val(pgd))); + return; + } + + WRITE_ONCE(*pgdp, pgd); + dsb(ishst); + isb(); +} + +static inline void pgd_clear(pgd_t *pgdp) +{ + if (pgtable_l5_enabled()) + set_pgd(pgdp, __pgd(0)); +} + +static inline phys_addr_t pgd_page_paddr(pgd_t pgd) +{ + return __pgd_to_phys(pgd); +} + +#define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) + +static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr) +{ + return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr); +} + +static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr) +{ + BUG_ON(!pgtable_l5_enabled()); + + return pgd_page_paddr(READ_ONCE(*pgdp)) + p4d_index(addr) * sizeof(p4d_t); +} + +static inline +p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long addr) +{ + if (!pgtable_l5_enabled()) + return pgd_to_folded_p4d(pgdp, addr); + return (p4d_t *)__va(pgd_page_paddr(pgd)) + p4d_index(addr); +} +#define p4d_offset_lockless p4d_offset_lockless + +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr) +{ + return p4d_offset_lockless(pgdp, READ_ONCE(*pgdp), addr); +} + +#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) + +#else + +static inline bool pgtable_l5_enabled(void) { return false; } + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #define pgd_ERROR(e) \ pr_err("%s:%d: bad 
pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e)) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ba00d0205447..d2e9dec38a15 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1025,7 +1025,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, if (CONFIG_PGTABLE_LEVELS <= 3) return; - if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) + if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK)) return; /* @@ -1048,8 +1048,8 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { - unsigned long next; p4d_t *p4dp, p4d; + unsigned long i, next, start = addr; do { next = p4d_addr_end(addr, end); @@ -1061,6 +1061,27 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, WARN_ON(!p4d_present(p4d)); free_empty_pud_table(p4dp, addr, next, floor, ceiling); } while (addr = next, addr < end); + + if (!pgtable_l5_enabled()) + return; + + if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) + return; + + /* + * Check whether we can free the p4d page if the rest of the + * entries are empty. Overlap with other regions have been + * handled by the floor/ceiling check. + */ + p4dp = p4d_offset(pgdp, 0UL); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (!p4d_none(READ_ONCE(p4dp[i]))) + return; + } + + pgd_clear(pgdp); + __flush_tlb_kernel_pgtable(start); + free_hotplug_pgtable_page(virt_to_page(p4dp)); } static void free_empty_tables(unsigned long addr, unsigned long end, @@ -1145,6 +1166,12 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } +#ifndef __PAGETABLE_P4D_FOLDED +void p4d_clear_huge(p4d_t *p4dp) +{ +} +#endif + int pud_clear_huge(pud_t *pudp) { if (!pud_sect(READ_ONCE(*pudp))) diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771..3c4f8a279d2b 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -17,11 +17,20 @@ static struct kmem_cache *pgd_cache __ro_after_init; +static bool pgdir_is_page_size(void) +{ + if (PGD_SIZE == PAGE_SIZE) + return true; + if (CONFIG_PGTABLE_LEVELS == 5) + return !pgtable_l5_enabled(); + return false; +} + pgd_t *pgd_alloc(struct mm_struct *mm) { gfp_t gfp = GFP_PGTABLE_USER; - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) return (pgd_t *)__get_free_page(gfp); else return kmem_cache_alloc(pgd_cache, gfp); @@ -29,7 +38,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) free_page((unsigned long)pgd); else kmem_cache_free(pgd_cache, pgd); @@ -37,7 +46,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) void __init pgtable_cache_init(void) { - if (PGD_SIZE == PAGE_SIZE) + if (pgdir_is_page_size()) return; #ifdef CONFIG_ARM64_PA_BITS_52 From 2b6c8f96cc47eb1b41f7ebf28dfc2459c39f7fa9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:18 +0100 Subject: [PATCH 065/134] arm64: mm: add LPA2 and 5 level paging support to G-to-nG conversion Add support for 5 level paging in the G-to-nG routine that creates its own temporary page tables to traverse the swapper page tables. Also add support for running the 5 level configuration with the top level folded at runtime, to support CPUs that do not implement the LPA2 extension. While at it, wire up the level skipping logic so it will also trigger on 4 level configurations with LPA2 enabled at build time but not active at runtime, as we'll fall back to 3 level paging in that case. 
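For readers who do not want to decode the asm, the per-level loop implemented by the kpti_mk_tbl_ng macro (modified below) behaves roughly like this C model (illustrative only; the DESC_VALID name is invented here, and the real code additionally recurses into table descriptors, now including the extra P4D level):

	#include <stddef.h>
	#include <stdint.h>

	#define DESC_VALID	(UINT64_C(1) << 0)
	#define PTE_NG		(UINT64_C(1) << 11)	/* same bit for blocks and pages */

	void mark_level_ng(uint64_t *table, size_t nr_entries)
	{
		for (size_t i = 0; i < nr_entries; i++) {
			uint64_t desc = table[i];

			/* skip invalid and already non-global entries */
			if (!(desc & DESC_VALID) || (desc & PTE_NG))
				continue;
			table[i] = desc | PTE_NG;
		}
	}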
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-77-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 9 +++-- arch/arm64/mm/proc.S | 70 +++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ed9670d8360c..bc5e4e569864 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1765,6 +1765,9 @@ static int __init __kpti_install_ng_mappings(void *__unused) pgd_t *kpti_ng_temp_pgd; u64 alloc = 0; + if (levels == 5 && !pgtable_l5_enabled()) + levels = 4; + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); if (!cpu) { @@ -1778,9 +1781,9 @@ static int __init __kpti_install_ng_mappings(void *__unused) // // The physical pages are laid out as follows: // - // +--------+-/-------+-/------ +-\\--------+ - // : PTE[] : | PMD[] : | PUD[] : || PGD[] : - // +--------+-\-------+-\------ +-//--------+ + // +--------+-/-------+-/------ +-/------ +-\\\--------+ + // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : + // +--------+-\-------+-\------ +-\------ +-///--------+ // ^ // The first page is mapped into this hierarchy at a PMD_SHIFT // aligned virtual address, so that we can manipulate the PTE diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index d03434b7bca5..fa0d7c63f8d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -216,16 +216,15 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) .macro kpti_mk_tbl_ng, type, num_entries add end_\type\()p, cur_\type\()p, #\num_entries * 8 .Ldo_\type: - ldr \type, [cur_\type\()p] // Load the entry + ldr \type, [cur_\type\()p], #8 // Load the entry and advance tbz \type, #0, .Lnext_\type // Skip invalid and tbnz \type, #11, .Lnext_\type // non-global entries orr \type, \type, #PTE_NG // Same bit for blocks and pages - str \type, [cur_\type\()p] // Update the entry + str \type, [cur_\type\()p, #-8] // Update the entry .ifnc \type, pte tbnz \type, #1, .Lderef_\type .endif .Lnext_\type: - add cur_\type\()p, cur_\type\()p, #8 cmp cur_\type\()p, end_\type\()p b.ne .Ldo_\type .endm @@ -235,18 +234,18 @@ SYM_FUNC_ALIAS(__pi_idmap_cpu_replace_ttbr1, idmap_cpu_replace_ttbr1) * fixmap slot associated with the current level. */ .macro kpti_map_pgtbl, type, level - str xzr, [temp_pte, #8 * (\level + 1)] // break before make + str xzr, [temp_pte, #8 * (\level + 2)] // break before make dsb nshst - add pte, temp_pte, #PAGE_SIZE * (\level + 1) + add pte, temp_pte, #PAGE_SIZE * (\level + 2) lsr pte, pte, #12 tlbi vaae1, pte dsb nsh isb phys_to_pte pte, cur_\type\()p - add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 1) + add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 2) orr pte, pte, pte_flags - str pte, [temp_pte, #8 * (\level + 1)] + str pte, [temp_pte, #8 * (\level + 2)] dsb nshst .endm @@ -279,6 +278,8 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) end_ptep .req x15 pte .req x16 valid .req x17 + cur_p4dp .req x19 + end_p4dp .req x20 mov x5, x3 // preserve temp_pte arg mrs swapper_ttb, ttbr1_el1 @@ -286,6 +287,12 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) cbnz cpu, __idmap_kpti_secondary +#if CONFIG_PGTABLE_LEVELS > 4 + stp x29, x30, [sp, #-32]! + mov x29, sp + stp x19, x20, [sp, #16] +#endif + /* We're the boot CPU. 
Wait for the others to catch up */ sevl 1: wfe @@ -303,9 +310,32 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) mov_q pte_flags, KPTI_NG_PTE_FLAGS /* Everybody is enjoying the idmap, so we can rewrite swapper. */ + +#ifdef CONFIG_ARM64_LPA2 + /* + * If LPA2 support is configured, but 52-bit virtual addressing is not + * enabled at runtime, we will fall back to one level of paging less, + * and so we have to walk swapper_pg_dir as if we dereferenced its + * address from a PGD level entry, and terminate the PGD level loop + * right after. + */ + adrp pgd, swapper_pg_dir // walk &swapper_pg_dir at the next level + mov cur_pgdp, end_pgdp // must be equal to terminate the PGD loop +alternative_if_not ARM64_HAS_VA52 + b .Lderef_pgd // skip to the next level +alternative_else_nop_endif + /* + * LPA2 based 52-bit virtual addressing requires 52-bit physical + * addressing to be enabled as well. In this case, the shareability + * bits are repurposed as physical address bits, and should not be + * set in pte_flags. + */ + bic pte_flags, pte_flags, #PTE_SHARED +#endif + /* PGD */ adrp cur_pgdp, swapper_pg_dir - kpti_map_pgtbl pgd, 0 + kpti_map_pgtbl pgd, -1 kpti_mk_tbl_ng pgd, PTRS_PER_PGD /* Ensure all the updated entries are visible to secondary CPUs */ @@ -318,16 +348,33 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) /* Set the flag to zero to indicate that we're all done */ str wzr, [flag_ptr] +#if CONFIG_PGTABLE_LEVELS > 4 + ldp x19, x20, [sp, #16] + ldp x29, x30, [sp], #32 +#endif ret .Lderef_pgd: + /* P4D */ + .if CONFIG_PGTABLE_LEVELS > 4 + p4d .req x30 + pte_to_phys cur_p4dp, pgd + kpti_map_pgtbl p4d, 0 + kpti_mk_tbl_ng p4d, PTRS_PER_P4D + b .Lnext_pgd + .else /* CONFIG_PGTABLE_LEVELS <= 4 */ + p4d .req pgd + .set .Lnext_p4d, .Lnext_pgd + .endif + +.Lderef_p4d: /* PUD */ .if CONFIG_PGTABLE_LEVELS > 3 pud .req x10 - pte_to_phys cur_pudp, pgd + pte_to_phys cur_pudp, p4d kpti_map_pgtbl pud, 1 kpti_mk_tbl_ng pud, PTRS_PER_PUD - b .Lnext_pgd + b .Lnext_p4d .else /* CONFIG_PGTABLE_LEVELS <= 3 */ pud .req pgd .set .Lnext_pud, .Lnext_pgd @@ -371,6 +418,9 @@ SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) .unreq end_ptep .unreq pte .unreq valid + .unreq cur_p4dp + .unreq end_p4dp + .unreq p4d /* Secondary CPUs end up here */ __idmap_kpti_secondary: From 9684ec186f8fadde52d6b6eaf64ca508897d0c71 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:19 +0100 Subject: [PATCH 066/134] arm64: Enable LPA2 at boot if supported by the system Update the early kernel mapping code to take 52-bit virtual addressing into account based on the LPA2 feature. This is a bit more involved than LVA (which is supported with 64k pages only), given that some page table descriptor bits change meaning in this case. To keep the handling in asm to a minimum, the initial ID map is still created with 48-bit virtual addressing, which implies that the kernel image must be loaded into 48-bit addressable physical memory. This is currently required by the boot protocol, even though we happen to support placement outside of that for LVA/64k based configurations. Enabling LPA2 involves more than setting TCR.T1SZ to a lower value, there is also a DS bit in TCR that needs to be set, and which changes the meaning of bits [9:8] in all page table descriptors. Since we cannot enable DS and every live page table descriptor at the same time, let's pivot through another temporary mapping. This avoids the need to reintroduce manipulations of the page tables with the MMU and caches disabled. 
To permit the LPA2 feature to be overridden on the kernel command line, which may be necessary to work around silicon errata, or to deal with mismatched features on heterogeneous SoC designs, test for CPU feature overrides first, and only then enable LPA2. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-78-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 8 +++- arch/arm64/include/asm/cpufeature.h | 18 ++++++++ arch/arm64/include/asm/memory.h | 4 ++ arch/arm64/kernel/head.S | 8 ++++ arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kernel/pi/map_kernel.c | 70 +++++++++++++++++++++++++++-- arch/arm64/kernel/pi/map_range.c | 11 +++-- arch/arm64/kernel/pi/pi.h | 4 +- arch/arm64/mm/init.c | 2 +- arch/arm64/mm/mmu.c | 6 ++- arch/arm64/mm/proc.S | 3 ++ 11 files changed, 124 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 7eedcb36ebe0..ce7b95cd6e79 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -581,11 +581,17 @@ alternative_endif * but we have to add an offset so that the TTBR1 address corresponds with the * pgdir entry that covers the lowest 48-bit addressable VA. * + * Note that this trick is only used for LVA/64k pages - LPA2/4k pages uses an + * additional paging level, and on LPA2/16k pages, we would end up with a root + * level table with only 2 entries, which is suboptimal in terms of TLB + * utilization, so there we fall back to 47 bits of translation if LPA2 is not + * supported. + * * orr is used as it can cover the immediate value (and is idempotent). * ttbr: Value of ttbr to set, modified. */ .macro offset_ttbr1, ttbr, tmp -#ifdef CONFIG_ARM64_VA_BITS_52 +#if defined(CONFIG_ARM64_VA_BITS_52) && !defined(CONFIG_ARM64_LPA2) mrs \tmp, tcr_el1 and \tmp, \tmp, #TCR_T1SZ_MASK cmp \tmp, #TCR_T1SZ(VA_BITS_MIN) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a2ac31aecdd9..a8f97690ce1f 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -1008,6 +1008,24 @@ static inline bool cpu_has_lva(void) ID_AA64MMFR2_EL1_VARange_SHIFT); } +static inline bool cpu_has_lpa2(void) +{ +#ifdef CONFIG_ARM64_LPA2 + u64 mmfr0; + int feat; + + mmfr0 = read_sysreg(id_aa64mmfr0_el1); + mmfr0 &= ~id_aa64mmfr0_override.mask; + mmfr0 |= id_aa64mmfr0_override.val; + feat = cpuid_feature_extract_signed_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + + return feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2; +#else + return false; +#endif +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9680d7444b3b..b850b1b91471 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -54,7 +54,11 @@ #define FIXADDR_TOP (-UL(SZ_8M)) #if VA_BITS > 48 +#ifdef CONFIG_ARM64_16K_PAGES +#define VA_BITS_MIN (47) +#else #define VA_BITS_MIN (48) +#endif #else #define VA_BITS_MIN (VA_BITS) #endif diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e25351addfd0..405e9bce8c73 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -89,6 +89,7 @@ SYM_CODE_START(primary_entry) mov sp, x1 mov x29, xzr adrp x0, init_idmap_pg_dir + mov x1, xzr bl __pi_create_init_idmap /* @@ -473,9 +474,16 @@ SYM_FUNC_END(__enable_mmu) #ifdef CONFIG_ARM64_VA_BITS_52 SYM_FUNC_START(__cpu_secondary_check52bitva) +#ifndef CONFIG_ARM64_LPA2 mrs_s x0, SYS_ID_AA64MMFR2_EL1 and 
x0, x0, ID_AA64MMFR2_EL1_VARange_MASK cbnz x0, 2f +#else + mrs x0, id_aa64mmfr0_el1 + sbfx x0, x0, #ID_AA64MMFR0_EL1_TGRAN_SHIFT, 4 + cmp x0, #ID_AA64MMFR0_EL1_TGRAN_LPA2 + b.ge 2f +#endif update_early_cpu_boot_status \ CPU_STUCK_IN_KERNEL | CPU_STUCK_REASON_52_BIT_VA, x0, x1 diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ff81f809a240..ba4f8f7d6a91 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -54,6 +54,7 @@ PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); +PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); PROVIDE(__pi_init_pg_dir = init_pg_dir); PROVIDE(__pi_init_pg_end = init_pg_end); PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 1853825aa29d..5fa08e13e17e 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -127,11 +127,64 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) } /* Copy the root page table to its final location */ - memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PGD_SIZE); + memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); dsb(ishst); idmap_cpu_replace_ttbr1(swapper_pg_dir); } +static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) +{ + u64 sctlr = read_sysreg(sctlr_el1); + u64 tcr = read_sysreg(tcr_el1) | TCR_DS; + + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " msr ttbr0_el1, %1 ;" + " msr tcr_el1, %2 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %3 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(ttbr), "r"(tcr), "r"(sctlr)); +} + +static void __init remap_idmap_for_lpa2(void) +{ + /* clear the bits that change meaning once LPA2 is turned on */ + pteval_t mask = PTE_SHARED; + + /* + * We have to clear bits [9:8] in all block or page descriptors in the + * initial ID map, as otherwise they will be (mis)interpreted as + * physical address bits once we flick the LPA2 switch (TCR.DS). Since + * we cannot manipulate live descriptors in that way without creating + * potential TLB conflicts, let's create another temporary ID map in a + * LPA2 compatible fashion, and update the initial ID map while running + * from that. + */ + create_init_idmap(init_pg_dir, mask); + dsb(ishst); + set_ttbr0_for_lpa2((u64)init_pg_dir); + + /* + * Recreate the initial ID map with the same granularity as before. + * Don't bother with the FDT, we no longer need it after this. 
+ */ + memset(init_idmap_pg_dir, 0, + (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); + + create_init_idmap(init_idmap_pg_dir, mask); + dsb(ishst); + + /* switch back to the updated initial ID map */ + set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); + + /* wipe the temporary ID map from memory */ + memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); +} + static void __init map_fdt(u64 fdt) { static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); @@ -154,6 +207,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) u64 va_base, pa_base = (u64)&_text; u64 kaslr_offset = pa_base % MIN_KIMG_ALIGN; int root_level = 4 - CONFIG_PGTABLE_LEVELS; + int va_bits = VA_BITS; int chosen; map_fdt((u64)fdt); @@ -165,8 +219,15 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) chosen = fdt_path_offset(fdt, chosen_str); init_feature_override(boot_status, fdt, chosen); - if (VA_BITS > VA_BITS_MIN && cpu_has_lva()) - sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(VA_BITS)); + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { + va_bits = VA_BITS_MIN; + } else if (IS_ENABLED(CONFIG_ARM64_LPA2) && !cpu_has_lpa2()) { + va_bits = VA_BITS_MIN; + root_level++; + } + + if (va_bits > VA_BITS_MIN) + sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); /* * The virtual KASLR displacement modulo 2MiB is decided by the @@ -184,6 +245,9 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); } + if (IS_ENABLED(CONFIG_ARM64_LPA2) && va_bits > VA_BITS_MIN) + remap_idmap_for_lpa2(); + va_base = KIMAGE_VADDR + kaslr_offset; map_kernel(kaslr_offset, va_base - pa_base, root_level); } diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 79e4f6a2efe1..5410b2cac590 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -87,14 +87,19 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, } } -asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir) +asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask) { u64 ptep = (u64)pg_dir + PAGE_SIZE; + pgprot_t text_prot = PAGE_KERNEL_ROX; + pgprot_t data_prot = PAGE_KERNEL; + + pgprot_val(text_prot) &= ~clrmask; + pgprot_val(data_prot) &= ~clrmask; map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, - PAGE_KERNEL_ROX, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, - PAGE_KERNEL, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); + data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); return ptep; } diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 1ea282a5f96a..c91e5e965cd3 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -21,7 +21,7 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; -extern pgd_t init_idmap_pg_dir[]; +extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); @@ -33,4 +33,4 @@ void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, asmlinkage void early_map_kernel(u64 boot_status, void *fdt); -asmlinkage u64 create_init_idmap(pgd_t *pgd); +asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index
74c1db8ce271..0f427b50fdc3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -238,7 +238,7 @@ void __init arm64_memblock_init(void) * physical address of PAGE_OFFSET, we have to *subtract* from it. */ if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52)) - memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52); + memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52); /* * Apply the memory limit if it was set. Since the kernel may be loaded diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d2e9dec38a15..d30ae4d3fdd9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -582,8 +582,12 @@ static void __init map_mem(pgd_t *pgdp) * entries at any level are being shared between the linear region and * the vmalloc region. Check whether this is true for the PGD level, in * which case it is guaranteed to be true for all other levels as well. + * (Unless we are running with support for LPA2, in which case the + * entire reduced VA space is covered by a single pgd_t which will have + * been populated without the PXNTable attribute by the time we get here.) */ - BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end)); + BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) && + pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1); early_kfence_pool = arm64_kfence_alloc_pool(); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index fa0d7c63f8d2..9d40f3ffd8d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -488,6 +488,9 @@ SYM_FUNC_START(__cpu_setup) mov x9, #64 - VA_BITS alternative_if ARM64_HAS_VA52 tcr_set_t1sz tcr, x9 +#ifdef CONFIG_ARM64_LPA2 + orr tcr, tcr, #TCR_DS +#endif alternative_else_nop_endif #endif From 6ed8a3a094b43a27ac35e5c95a8004a1d83d1b79 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:20 +0100 Subject: [PATCH 067/134] arm64: mm: Add 5 level paging support to fixmap and swapper handling Add support for using 5 levels of paging in the fixmap, as well as in the kernel page table handling code which uses fixmaps internally. This also handles the case where a 5 level build runs on hardware that only supports 4 levels of paging. 
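For illustration (a sketch only; the real call sites are in the mmu.c hunk below), a table walk that needs to modify a p4d level table now maps it transiently through the new FIX_P4D slot and unmaps it afterwards, and both steps fold away when only four levels are active at runtime:

| /* sketch: transiently map a p4d table while building kernel mappings */
| p4d_t *p4dp = p4d_set_fixmap_offset(pgdp, addr);
| /* ... read and write p4d entries through p4dp ... */
| p4d_clear_fixmap(); /* no-op when pgtable_l5_enabled() is false */

When 5 levels are not enabled, p4d_set_fixmap_offset() simply returns the folded pgd entry, so callers need not distinguish the two cases.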
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-79-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/fixmap.h | 1 + arch/arm64/include/asm/pgtable.h | 45 ++++++++++++++++++++++++++---- arch/arm64/mm/fixmap.c | 2 +- arch/arm64/mm/mmu.c | 47 +++++++++++++++++++++++++++++--- 4 files changed, 85 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 8aabd45e9a13..87e307804b99 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -87,6 +87,7 @@ enum fixed_addresses { FIX_PTE, FIX_PMD, FIX_PUD, + FIX_P4D, FIX_PGD, __end_of_fixed_addresses diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7eb2b933ed3c..3d7fb3cde83d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -621,12 +621,12 @@ static inline bool pud_table(pud_t pud) { return true; } PUD_TYPE_TABLE) #endif -extern pgd_t init_pg_dir[PTRS_PER_PGD]; +extern pgd_t init_pg_dir[]; extern pgd_t init_pg_end[]; -extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; -extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; -extern pgd_t reserved_pg_dir[PTRS_PER_PGD]; +extern pgd_t swapper_pg_dir[]; +extern pgd_t idmap_pg_dir[]; +extern pgd_t tramp_pg_dir[]; +extern pgd_t reserved_pg_dir[]; extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); @@ -891,12 +891,47 @@ static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr) return p4d_offset_lockless(pgdp, READ_ONCE(*pgdp), addr); } +static inline p4d_t *p4d_set_fixmap(unsigned long addr) +{ + if (!pgtable_l5_enabled()) + return NULL; + return (p4d_t *)set_fixmap_offset(FIX_P4D, addr); +} + +static inline p4d_t *p4d_set_fixmap_offset(pgd_t *pgdp, unsigned long addr) +{ + if (!pgtable_l5_enabled()) + return pgd_to_folded_p4d(pgdp, addr); + return p4d_set_fixmap(p4d_offset_phys(pgdp, addr)); +} + +static inline void p4d_clear_fixmap(void) +{ + if (pgtable_l5_enabled()) + clear_fixmap(FIX_P4D); +} + +/* use ONLY for statically allocated translation tables */ +static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr) +{ + if (!pgtable_l5_enabled()) + return pgd_to_folded_p4d(pgdp, addr); + return (p4d_t *)__phys_to_kimg(p4d_offset_phys(pgdp, addr)); +} + #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) #else static inline bool pgtable_l5_enabled(void) { return false; } +/* Match p4d_offset folding in */ +#define p4d_set_fixmap(addr) NULL +#define p4d_set_fixmap_offset(p4dp, addr) ((p4d_t *)p4dp) +#define p4d_clear_fixmap() + +#define p4d_offset_kimg(dir,addr) ((p4d_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #define pgd_ERROR(e) \ diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index 9404f282f829..d22506e9c7fd 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -104,7 +104,7 @@ void __init early_fixmap_init(void) unsigned long end = FIXADDR_TOP; pgd_t *pgdp = pgd_offset_k(addr); - p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t *p4dp = p4d_offset_kimg(pgdp, addr); early_fixmap_init_pud(p4dp, addr, end); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d30ae4d3fdd9..8e5b3a7c5afd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -313,15 +313,14 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, } while (addr = next, addr != end); } -static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, +static void alloc_init_pud(p4d_t *p4dp, unsigned long 
addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pud_t *pudp; - p4d_t *p4dp = p4d_offset(pgdp, addr); p4d_t p4d = READ_ONCE(*p4dp); + pud_t *pudp; if (p4d_none(p4d)) { p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; @@ -369,6 +368,46 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, pud_clear_fixmap(); } +static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(int), + int flags) +{ + unsigned long next; + pgd_t pgd = READ_ONCE(*pgdp); + p4d_t *p4dp; + + if (pgd_none(pgd)) { + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN; + phys_addr_t p4d_phys; + + if (flags & NO_EXEC_MAPPINGS) + pgdval |= PGD_TABLE_PXN; + BUG_ON(!pgtable_alloc); + p4d_phys = pgtable_alloc(P4D_SHIFT); + __pgd_populate(pgdp, p4d_phys, pgdval); + pgd = READ_ONCE(*pgdp); + } + BUG_ON(pgd_bad(pgd)); + + p4dp = p4d_set_fixmap_offset(pgdp, addr); + do { + p4d_t old_p4d = READ_ONCE(*p4dp); + + next = p4d_addr_end(addr, end); + + alloc_init_pud(p4dp, addr, next, phys, prot, + pgtable_alloc, flags); + + BUG_ON(p4d_val(old_p4d) != 0 && + p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); + + phys += next - addr; + } while (p4dp++, addr = next, addr != end); + + p4d_clear_fixmap(); +} + static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, @@ -391,7 +430,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, + alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, flags); phys += next - addr; } while (pgdp++, addr = next, addr != end); From 0383808e4d99ac31892655ae9dc93597eb6f1412 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:21 +0100 Subject: [PATCH 068/134] arm64: kasan: Reduce minimum shadow alignment and enable 5 level paging Allow the KASAN init code to deal with 5 levels of paging, and relax the requirement that the shadow region is aligned to the top level pgd_t size. This is necessary for LPA2 based 52-bit virtual addressing, where the KASAN shadow will never be aligned to the pgd_t size. Allowing this also enables the 16k/48-bit case for KASAN, which is a nice bonus. This involves some hackery to manipulate the root and next level page tables without having to distinguish all the various configurations, including 16k/48-bits (which has a two entry pgd_t level), and LPA2 configurations running with one translation level less on non-LPA2 hardware. 
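To put numbers on the relaxed alignment (4k pages as a worked example; root_level_aligned() below derives the same shift at runtime):

| /* each root level descriptor covers PAGE_SIZE << ((levels - 1) * (PAGE_SHIFT - 3)) */
| /* vabits_actual == 48: 4 levels -> each pgd_t entry covers 1 << 39 == 512 GiB */
| /* vabits_actual == 52: 5 levels -> each pgd_t entry covers 1 << 48 == 256 TiB */

The shadow region itself only needs SHADOW_ALIGN (PUD_SIZE or P4D_SIZE) alignment, so its start address may now land in the middle of a root level descriptor that is shared with the linear region, which is what the extra handling below deals with.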
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-80-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 +- arch/arm64/mm/kasan_init.c | 148 ++++++++++++++++++++++++++++++++----- 2 files changed, 130 insertions(+), 20 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8c2c36fffcf5..9ca3316d6379 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -164,7 +164,7 @@ config ARM64 select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE - select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) + select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index a86ab99587c9..fbddbf9faf19 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -23,7 +23,7 @@ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); +static pgd_t tmp_pg_dir[PTRS_PER_PTE] __initdata __aligned(PAGE_SIZE); /* * The p*d_populate functions call virt_to_phys implicitly so they can't be used @@ -99,6 +99,19 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); } +static p4d_t *__init kasan_p4d_offset(pgd_t *pgdp, unsigned long addr, int node, + bool early) +{ + if (pgd_none(READ_ONCE(*pgdp))) { + phys_addr_t p4d_phys = early ? + __pa_symbol(kasan_early_shadow_p4d) + : kasan_alloc_zeroed_page(node); + __pgd_populate(pgdp, p4d_phys, PGD_TYPE_TABLE); + } + + return early ? p4d_offset_kimg(pgdp, addr) : p4d_offset(pgdp, addr); +} + static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, unsigned long end, int node, bool early) { @@ -144,12 +157,12 @@ static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - p4d_t *p4dp = p4d_offset(pgdp, addr); + p4d_t *p4dp = kasan_p4d_offset(pgdp, addr, node, early); do { next = p4d_addr_end(addr, end); kasan_pud_populate(p4dp, addr, next, node, early); - } while (p4dp++, addr = next, addr != end); + } while (p4dp++, addr = next, addr != end && p4d_none(READ_ONCE(*p4dp))); } static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, @@ -165,19 +178,48 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, } while (pgdp++, addr = next, addr != end); } +#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS > 4 +#define SHADOW_ALIGN P4D_SIZE +#else +#define SHADOW_ALIGN PUD_SIZE +#endif + +/* + * Return whether 'addr' is aligned to the size covered by a root level + * descriptor. + */ +static bool __init root_level_aligned(u64 addr) +{ + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3); + + return (addr % (PAGE_SIZE << shift)) == 0; +} + /* The early shadow maps everything to a single page of zeroes */ asmlinkage void __init kasan_early_init(void) { BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); - /* - * We cannot check the actual value of KASAN_SHADOW_START during build, - * as it depends on vabits_actual. As a best-effort approach, check - * potential values calculated based on VA_BITS and VA_BITS_MIN. 
- */ - BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), PGDIR_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), PGDIR_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS), SHADOW_ALIGN)); + BUILD_BUG_ON(!IS_ALIGNED(_KASAN_SHADOW_START(VA_BITS_MIN), SHADOW_ALIGN)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, SHADOW_ALIGN)); + + if (!root_level_aligned(KASAN_SHADOW_START)) { + /* + * The start address is misaligned, and so the next level table + * will be shared with the linear region. This can happen with + * 4 or 5 level paging, so install a generic pte_t[] as the + * next level. This prevents the kasan_pgd_populate call below + * from inserting an entry that refers to the shared KASAN zero + * shadow pud_t[]/p4d_t[], which could end up getting corrupted + * when the linear region is mapped. + */ + static pte_t tbl[PTRS_PER_PTE] __page_aligned_bss; + pgd_t *pgdp = pgd_offset_k(KASAN_SHADOW_START); + + set_pgd(pgdp, __pgd(__pa_symbol(tbl) | PGD_TYPE_TABLE)); + } + kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, true); } @@ -189,20 +231,75 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end, kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false); } -static void __init clear_pgds(unsigned long start, - unsigned long end) +/* + * Return the descriptor index of 'addr' in the root level table + */ +static int __init root_level_idx(u64 addr) { /* - * Remove references to kasan page tables from - * swapper_pg_dir. pgd_clear() can't be used - * here because it's nop on 2,3-level pagetable setups + * On 64k pages, the TTBR1 range root tables are extended for 52-bit + * virtual addressing, and TTBR1 will simply point to the pgd_t entry + * that covers the start of the 48-bit addressable VA space if LVA is + * not implemented. This means we need to index the table as usual, + * instead of masking off bits based on vabits_actual. */ - for (; start < end; start += PGDIR_SIZE) - set_pgd(pgd_offset_k(start), __pgd(0)); + u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? VA_BITS + : vabits_actual; + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3); + + return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT); +} + +/* + * Clone a next level table from swapper_pg_dir into tmp_pg_dir + */ +static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud) +{ + int idx = root_level_idx(addr); + pgd_t pgd = READ_ONCE(swapper_pg_dir[idx]); + pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd)); + + memcpy(pud, pudp, PAGE_SIZE); + tmp_pg_dir[idx] = __pgd(__phys_to_pgd_val(__pa_symbol(pud)) | + PUD_TYPE_TABLE); +} + +/* + * Return the descriptor index of 'addr' in the next level table + */ +static int __init next_level_idx(u64 addr) +{ + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3); + + return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE; +} + +/* + * Dereference the table descriptor at 'pgd_idx' and clear the entries from + * 'start' to 'end' (exclusive) from the table. 
+ */ +static void __init clear_next_level(int pgd_idx, int start, int end) +{ + pgd_t pgd = READ_ONCE(swapper_pg_dir[pgd_idx]); + pud_t *pudp = (pud_t *)__phys_to_kimg(__pgd_to_phys(pgd)); + + memset(&pudp[start], 0, (end - start) * sizeof(pud_t)); +} + +static void __init clear_shadow(u64 start, u64 end) +{ + int l = root_level_idx(start), m = root_level_idx(end); + + if (!root_level_aligned(start)) + clear_next_level(l++, next_level_idx(start), PTRS_PER_PTE); + if (!root_level_aligned(end)) + clear_next_level(m, 0, next_level_idx(end)); + memset(&swapper_pg_dir[l], 0, (m - l) * sizeof(pgd_t)); } static void __init kasan_init_shadow(void) { + static pud_t pud[2][PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); u64 kimg_shadow_start, kimg_shadow_end; u64 mod_shadow_start; u64 vmalloc_shadow_end; @@ -224,10 +321,23 @@ static void __init kasan_init_shadow(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); + + /* + * If the start or end address of the shadow region is not aligned to + * the root level size, we have to allocate a temporary next-level table + * in each case, clone the next level of descriptors, and install the + * table into tmp_pg_dir. Note that with 5 levels of paging, the next + * level will in fact be p4d_t, but that makes no difference in this + * case. + */ + if (!root_level_aligned(KASAN_SHADOW_START)) + clone_next_level(KASAN_SHADOW_START, tmp_pg_dir, pud[0]); + if (!root_level_aligned(KASAN_SHADOW_END)) + clone_next_level(KASAN_SHADOW_END, tmp_pg_dir, pud[1]); dsb(ishst); cpu_replace_ttbr1(lm_alias(tmp_pg_dir)); - clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + clear_shadow(KASAN_SHADOW_START, KASAN_SHADOW_END); kasan_map_populate(kimg_shadow_start, kimg_shadow_end, early_pfn_to_nid(virt_to_pfn(lm_alias(KERNEL_START)))); From 0dd4f60a2c76938c2625f6c630c225699d97608b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:22 +0100 Subject: [PATCH 069/134] arm64: mm: Add support for folding PUDs at runtime In order to support LPA2 on 16k pages in a way that permits non-LPA2 systems to run the same kernel image, we have to be able to fall back to at most 48 bits of virtual addressing. Falling back to 48 bits would result in a level 0 with only 2 entries, which is suboptimal in terms of TLB utilization. So instead, let's fall back to 47 bits in that case. This means we need to be able to fold PUDs dynamically, similar to how we fold P4Ds for 48 bit virtual addressing on LPA2 with 4k pages. 
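To spell out the arithmetic behind the 47-bit fallback (16k pages, i.e. a page shift of 14 and 11 translation bits per level):

| /* 48-bit VA: (48 - 4) / 11 == 4 levels; level 0 resolves 48 - 14 - 3 * 11 == 1 bit, */
| /* giving a 2-entry table */
| /* 47-bit VA: (47 - 4) / 11 == 3 levels; the root level resolves 47 - 14 - 2 * 11 == 11 */
| /* bits, giving a fully populated 2048-entry table */

So giving up a single VA bit removes one level of translation entirely, and replaces the degenerate 2-entry level 0 table with a full root table.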
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-81-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgalloc.h | 12 ++++- arch/arm64/include/asm/pgtable.h | 89 +++++++++++++++++++++++++++----- arch/arm64/include/asm/tlb.h | 3 ++ arch/arm64/kernel/cpufeature.c | 2 + arch/arm64/mm/mmu.c | 2 +- arch/arm64/mm/pgd.c | 2 + 6 files changed, 96 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index cae8c648f462..aeba2cf15a25 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -14,6 +14,7 @@ #include #define __HAVE_ARCH_PGD_FREE +#define __HAVE_ARCH_PUD_FREE #include #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) @@ -43,7 +44,8 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { - set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); + if (pgtable_l4_enabled()) + set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot)); } static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) @@ -53,6 +55,14 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN; __p4d_populate(p4dp, __pa(pudp), p4dval); } + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (!pgtable_l4_enabled()) + return; + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + free_page((unsigned long)pud); +} #else static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) { diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3d7fb3cde83d..b3c716fa8121 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -759,12 +759,27 @@ static inline pmd_t *pud_pgtable(pud_t pud) #if CONFIG_PGTABLE_LEVELS > 3 +static __always_inline bool pgtable_l4_enabled(void) +{ + if (CONFIG_PGTABLE_LEVELS > 4 || !IS_ENABLED(CONFIG_ARM64_LPA2)) + return true; + if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT)) + return vabits_actual == VA_BITS; + return alternative_has_cap_unlikely(ARM64_HAS_VA52); +} + +static inline bool mm_pud_folded(const struct mm_struct *mm) +{ + return !pgtable_l4_enabled(); +} +#define mm_pud_folded mm_pud_folded + #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) -#define p4d_none(p4d) (!p4d_val(p4d)) -#define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) -#define p4d_present(p4d) (p4d_val(p4d)) +#define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d)) +#define p4d_bad(p4d) (pgtable_l4_enabled() && !(p4d_val(p4d) & 2)) +#define p4d_present(p4d) (!p4d_none(p4d)) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { @@ -780,7 +795,8 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline void p4d_clear(p4d_t *p4dp) { - set_p4d(p4dp, __p4d(0)); + if (pgtable_l4_enabled()) + set_p4d(p4dp, __p4d(0)); } static inline phys_addr_t p4d_page_paddr(p4d_t p4d) @@ -788,25 +804,74 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d) return __p4d_to_phys(p4d); } +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr) +{ + return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr); +} + static inline pud_t *p4d_pgtable(p4d_t p4d) { return (pud_t *)__va(p4d_page_paddr(p4d)); } -/* Find an entry in the first-level page table. 
*/ -#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) +static inline phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr) +{ + BUG_ON(!pgtable_l4_enabled()); -#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) -#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr)) -#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + return p4d_page_paddr(READ_ONCE(*p4dp)) + pud_index(addr) * sizeof(pud_t); +} + +static inline +pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long addr) +{ + if (!pgtable_l4_enabled()) + return p4d_to_folded_pud(p4dp, addr); + return (pud_t *)__va(p4d_page_paddr(p4d)) + pud_index(addr); +} +#define pud_offset_lockless pud_offset_lockless + +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long addr) +{ + return pud_offset_lockless(p4dp, READ_ONCE(*p4dp), addr); +} +#define pud_offset pud_offset + +static inline pud_t *pud_set_fixmap(unsigned long addr) +{ + if (!pgtable_l4_enabled()) + return NULL; + return (pud_t *)set_fixmap_offset(FIX_PUD, addr); +} + +static inline pud_t *pud_set_fixmap_offset(p4d_t *p4dp, unsigned long addr) +{ + if (!pgtable_l4_enabled()) + return p4d_to_folded_pud(p4dp, addr); + return pud_set_fixmap(pud_offset_phys(p4dp, addr)); +} + +static inline void pud_clear_fixmap(void) +{ + if (pgtable_l4_enabled()) + clear_fixmap(FIX_PUD); +} + +/* use ONLY for statically allocated translation tables */ +static inline pud_t *pud_offset_kimg(p4d_t *p4dp, u64 addr) +{ + if (!pgtable_l4_enabled()) + return p4d_to_folded_pud(p4dp, addr); + return (pud_t *)__phys_to_kimg(pud_offset_phys(p4dp, addr)); +} #define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) -/* use ONLY for statically allocated translation tables */ -#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) - #else +static inline bool pgtable_l4_enabled(void) { return false; } + #define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in */ diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 0150deb332af..a947c6e784ed 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -103,6 +103,9 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, { struct ptdesc *ptdesc = virt_to_ptdesc(pudp); + if (!pgtable_l4_enabled()) + return; + pagetable_pud_dtor(ptdesc); tlb_remove_ptdesc(tlb, ptdesc); } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index bc5e4e569864..94f035f6c421 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1767,6 +1767,8 @@ static int __init __kpti_install_ng_mappings(void *__unused) if (levels == 5 && !pgtable_l5_enabled()) levels = 4; + else if (levels == 4 && !pgtable_l4_enabled()) + levels = 3; remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8e5b3a7c5afd..b131ed31a6c8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,7 +1065,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, free_empty_pmd_table(pudp, addr, next, floor, ceiling); } while (addr = next, addr < end); - if (CONFIG_PGTABLE_LEVELS <= 3) + if (!pgtable_l4_enabled()) return; if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK)) diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 3c4f8a279d2b..0c501cabc238 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c 
@@ -21,6 +21,8 @@ static bool pgdir_is_page_size(void) { if (PGD_SIZE == PAGE_SIZE) return true; + if (CONFIG_PGTABLE_LEVELS == 4) + return !pgtable_l4_enabled(); if (CONFIG_PGTABLE_LEVELS == 5) return !pgtable_l5_enabled(); return false; From 16f22981b6d7ea6815d2e4527b82cac5e2f65c89 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:23 +0100 Subject: [PATCH 070/134] arm64: ptdump: Disregard unaddressable VA space Configurations built with support for 52-bit virtual addressing can also run on CPUs that only support 48 bits of VA space, in which case only that part of swapper_pg_dir that represents the 48-bit addressable region is relevant, and everything else is ignored by the hardware. Our software pagetable walker has little in the way of input address validation, and so it will happily start a walk from an address that is not representable by the number of paging levels that are actually active, resulting in lots of bogus output from the page table dumper unless we take care to start at a valid address. So define the start address at runtime based on vabits_actual. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-82-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 5f0849528ccf..16d0cf1d85c4 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -313,7 +313,6 @@ static void __init ptdump_initialize(void) static struct ptdump_info kernel_ptdump_info __ro_after_init = { .mm = &init_mm, - .base_addr = PAGE_OFFSET, }; void ptdump_check_wx(void) @@ -329,7 +328,7 @@ void ptdump_check_wx(void) .ptdump = { .note_page = note_page, .range = (struct ptdump_range[]) { - {PAGE_OFFSET, ~0UL}, + {_PAGE_OFFSET(vabits_actual), ~0UL}, {0, 0} } } @@ -370,6 +369,7 @@ static int __init ptdump_init(void) static struct addr_marker address_markers[ARRAY_SIZE(m)] __ro_after_init; kernel_ptdump_info.markers = memcpy(address_markers, m, sizeof(m)); + kernel_ptdump_info.base_addr = page_offset; ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); From d40900fcb39700207823486ca512a1a87d6331e2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:24 +0100 Subject: [PATCH 071/134] arm64: ptdump: Deal with translation levels folded at runtime Currently, the ptdump code deals with folded PMD or PUD levels at build time, by omitting those levels when invoking note_page. IOW, note_page() is never invoked with level == 1 if P4Ds are folded in the build configuration. With the introduction of LPA2 support, we will defer some of these folding decisions to runtime, so let's take care of this by overriding the 'level' argument when this condition triggers. Substituting the PUD or PMD strings for "PGD" when the level in question is folded at build time is no longer necessary, and so the conditional expressions can be simplified. This also makes the indirection of the 'name' field unnecessary, so change that into a char[] array, and make the whole thing __ro_after_init. Note that the mm_p?d_folded() functions currently ignore their mm pointer arguments, but let's wire them up correctly anyway. 
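As a worked example of the condition being handled (configurations as introduced by the preceding patches in this series): on a 4k/52-bit build running on non-LPA2 hardware the p4d level is folded at runtime, and on a 16k/52-bit build falling back to 47 bits the pud level is folded, so the walker hands note_page() level 1 or level 2 entries that actually describe the root table:

| /* 4k/52 build at 48 bits: p4d folded -> root entries arrive with level == 1 */
| /* 16k/52 build at 47 bits: pud folded -> root entries arrive with level == 2 */
| /* the runtime check below maps both cases back to level 0, i.e. "PGD" */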
Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-83-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 16d0cf1d85c4..5b87f8d623f7 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -48,6 +48,7 @@ struct pg_state { struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; + const struct mm_struct *mm; unsigned long start_address; int level; u64 current_prot; @@ -144,12 +145,12 @@ static const struct prot_bits pte_bits[] = { struct pg_level { const struct prot_bits *bits; - const char *name; - size_t num; + char name[4]; + int num; u64 mask; }; -static struct pg_level pg_level[] = { +static struct pg_level pg_level[] __ro_after_init = { { /* pgd */ .name = "PGD", .bits = pte_bits, @@ -159,11 +160,11 @@ static struct pg_level pg_level[] = { .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pud */ - .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD", + .name = "PUD", .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pmd */ - .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD", + .name = "PMD", .bits = pte_bits, .num = ARRAY_SIZE(pte_bits), }, { /* pte */ @@ -227,6 +228,11 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, static const char units[] = "KMGTPE"; u64 prot = 0; + /* check if the current level has been folded dynamically */ + if ((level == 1 && mm_p4d_folded(st->mm)) || + (level == 2 && mm_pud_folded(st->mm))) + level = 0; + if (level >= 0) prot = val & pg_level[level].mask; @@ -288,6 +294,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info) st = (struct pg_state){ .seq = s, .marker = info->markers, + .mm = info->mm, .level = -1, .ptdump = { .note_page = note_page, From 95e059b5db6082e62632f40434059759c7c1f6ed Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:25 +0100 Subject: [PATCH 072/134] arm64: kvm: avoid CONFIG_PGTABLE_LEVELS for runtime levels get_user_mapping_size() uses vabits_actual and CONFIG_PGTABLE_LEVELS to provide the starting point for a table walk. This is fine for LVA, as the number of translation levels is the same regardless of whether LVA is enabled. However, with LPA2, this will no longer be the case, so let's derive the number of levels from the number of VA bits directly. 
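To illustrate why CONFIG_PGTABLE_LEVELS no longer works (a worked example: 4k pages, a 52-bit LPA2-capable build booted on non-LPA2 hardware, so vabits_actual == 48 and KVM_PGTABLE_LAST_LEVEL == 3):

| /* ARM64_HW_PGTABLE_LEVELS(va_bits) evaluates to ((va_bits) - 4) / (PAGE_SHIFT - 3) */
| /* ARM64_HW_PGTABLE_LEVELS(48) == 44 / 9 == 4 -> start_level == 3 - 4 + 1 == 0 */
| /* CONFIG_PGTABLE_LEVELS == 5 would yield 3 - 5 + 1 == -1, i.e. a bogus start level */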
Acked-by: Marc Zyngier Acked-by: Oliver Upton Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-84-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 6fa9e816df40..cd9456a03e38 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -805,7 +805,7 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, .start_level = (KVM_PGTABLE_LAST_LEVEL - - CONFIG_PGTABLE_LEVELS + 1), + ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; From 352b0395b5053fca01b9dc60294235511f5f3d65 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:26 +0100 Subject: [PATCH 073/134] arm64: Enable 52-bit virtual addressing for 4k and 16k granule configs Update Kconfig to permit 4k and 16k granule configurations to be built with 52-bit virtual addressing, now that all the prerequisites are in place. While at it, update the feature description so it matches on the appropriate feature bits depending on the page size. For simplicity, let's just keep ARM64_HAS_VA52 as the feature name. Note that LPA2 based 52-bit virtual addressing requires 52-bit physical addressing support to be enabled as well, as programming TCR.TxSZ to values below 16 is not allowed unless TCR.DS is set, which is what activates the 52-bit physical addressing support. While supporting the converse (52-bit physical addressing without 52-bit virtual addressing) would be possible in principle, let's keep things simple, by only allowing these features to be enabled at the same time. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-85-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 17 ++++++++++------- arch/arm64/kernel/cpufeature.c | 24 +++++++++++++++++++----- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9ca3316d6379..eed8fef08a10 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -368,7 +368,9 @@ config PGTABLE_LEVELS default 3 if ARM64_64K_PAGES && (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) default 3 if ARM64_4K_PAGES && ARM64_VA_BITS_39 default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47 + default 4 if ARM64_16K_PAGES && (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48 + default 5 if ARM64_4K_PAGES && ARM64_VA_BITS_52 config ARCH_SUPPORTS_UPROBES def_bool y @@ -396,13 +398,13 @@ config BUILTIN_RETURN_ADDRESS_STRIPS_PAC config KASAN_SHADOW_OFFSET hex depends on KASAN_GENERIC || KASAN_SW_TAGS - default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS - default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS + default 0xdfff800000000000 if (ARM64_VA_BITS_48 || (ARM64_VA_BITS_52 && !ARM64_16K_PAGES)) && !KASAN_SW_TAGS + default 0xdfffc00000000000 if (ARM64_VA_BITS_47 || ARM64_VA_BITS_52) && ARM64_16K_PAGES && !KASAN_SW_TAGS default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS default 0xdfffffc000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS default 0xdffffff800000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS - default 0xefff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS - default 0xefffc00000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS + default 0xefff800000000000 if (ARM64_VA_BITS_48 || (ARM64_VA_BITS_52 && 
!ARM64_16K_PAGES)) && KASAN_SW_TAGS + default 0xefffc00000000000 if (ARM64_VA_BITS_47 || ARM64_VA_BITS_52) && ARM64_16K_PAGES && KASAN_SW_TAGS default 0xeffffe0000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS default 0xefffffc000000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS @@ -1310,7 +1312,7 @@ config ARM64_VA_BITS_48 config ARM64_VA_BITS_52 bool "52-bit" - depends on ARM64_64K_PAGES && (ARM64_PAN || !ARM64_SW_TTBR0_PAN) + depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN help Enable 52-bit virtual addressing for userspace when explicitly requested via a hint to mmap(). The kernel will also use 52-bit @@ -1357,10 +1359,11 @@ choice config ARM64_PA_BITS_48 bool "48-bit" + depends on ARM64_64K_PAGES || !ARM64_VA_BITS_52 config ARM64_PA_BITS_52 - bool "52-bit (ARMv8.2)" - depends on ARM64_64K_PAGES + bool "52-bit" + depends on ARM64_64K_PAGES || ARM64_VA_BITS_52 depends on ARM64_PAN || !ARM64_SW_TTBR0_PAN help Enable support for a 52-bit physical address space, introduced as diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 94f035f6c421..0be9296e9253 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2703,15 +2703,29 @@ static const struct arm64_cpu_capabilities arm64_features[] = { }, #ifdef CONFIG_ARM64_VA_BITS_52 { - .desc = "52-bit Virtual Addressing (LVA)", .capability = ARM64_HAS_VA52, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .sign = FTR_UNSIGNED, - .field_width = 4, - .field_pos = ID_AA64MMFR2_EL1_VARange_SHIFT, .matches = has_cpuid_feature, + .field_width = 4, +#ifdef CONFIG_ARM64_64K_PAGES + .desc = "52-bit Virtual Addressing (LVA)", + .sign = FTR_SIGNED, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .field_pos = ID_AA64MMFR2_EL1_VARange_SHIFT, .min_field_value = ID_AA64MMFR2_EL1_VARange_52, +#else + .desc = "52-bit Virtual Addressing (LPA2)", + .sys_reg = SYS_ID_AA64MMFR0_EL1, +#ifdef CONFIG_ARM64_4K_PAGES + .sign = FTR_SIGNED, + .field_pos = ID_AA64MMFR0_EL1_TGRAN4_SHIFT, + .min_field_value = ID_AA64MMFR0_EL1_TGRAN4_52_BIT, +#else + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR0_EL1_TGRAN16_SHIFT, + .min_field_value = ID_AA64MMFR0_EL1_TGRAN16_52_BIT, +#endif +#endif }, #endif {}, From 5d101654226d64ac0a6928019fbf476b46e9d14b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:27 +0100 Subject: [PATCH 074/134] arm64: defconfig: Enable LPA2 support We typically enable support in defconfig for all architectural features for which we can detect at runtime if the hardware actually supports them. Now that we have implemented support for LPA2 based 52-bit virtual addressing in a way that should not impact 48-bit operation on non-LPA2 CPU, we can do the same, and enable 52-bit virtual addressing by default. Catalin adds: Currently the "Virtual address space size" arch/arm64/Kconfig menu entry sets different defaults for each page size. However, all are overridden by the defconfig to 48 bits. Set the new default in Kconfig and remove the defconfig line. 
[ardb: squash follow-up fix from Catalin] Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-86-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 4 +--- arch/arm64/configs/defconfig | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index eed8fef08a10..160856de9bbb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1283,9 +1283,7 @@ endchoice choice prompt "Virtual address space size" - default ARM64_VA_BITS_39 if ARM64_4K_PAGES - default ARM64_VA_BITS_47 if ARM64_16K_PAGES - default ARM64_VA_BITS_42 if ARM64_64K_PAGES + default ARM64_VA_BITS_52 help Allows choosing one of multiple possible virtual address space sizes. The level of translation table is determined by diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e6cf3e5d63c3..f086b0624ec8 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -76,7 +76,6 @@ CONFIG_ARCH_VEXPRESS=y CONFIG_ARCH_VISCONTI=y CONFIG_ARCH_XGENE=y CONFIG_ARCH_ZYNQMP=y -CONFIG_ARM64_VA_BITS_48=y CONFIG_SCHED_MC=y CONFIG_SCHED_SMT=y CONFIG_NUMA=y From cb1a393c40eee2f1692c995ea0cc6e45bfccde4d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:28 +0100 Subject: [PATCH 075/134] mm: add arch hook to validate mmap() prot flags Add a hook to permit architectures to perform validation on the prot flags passed to mmap(), like arch_validate_prot() does for mprotect(). This will be used by arm64 to reject PROT_WRITE+PROT_EXEC mappings on configurations that run with WXN enabled. Reviewed-by: Kees Cook Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240214122845.2033971-87-ardb+git@google.com Signed-off-by: Catalin Marinas --- include/linux/mman.h | 15 +++++++++++++++ mm/mmap.c | 3 +++ 2 files changed, 18 insertions(+) diff --git a/include/linux/mman.h b/include/linux/mman.h index dc7048824be8..ec5e7f606e43 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -124,6 +124,21 @@ static inline bool arch_validate_flags(unsigned long flags) #define arch_validate_flags arch_validate_flags #endif +#ifndef arch_validate_mmap_prot +/* + * This is called from mmap(), which ignores unknown prot bits so the default + * is to accept anything. + * + * Returns true if the prot flags are valid + */ +static inline bool arch_validate_mmap_prot(unsigned long prot, + unsigned long addr) +{ + return true; +} +#define arch_validate_mmap_prot arch_validate_mmap_prot +#endif + /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b..977a8c3fd9f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1229,6 +1229,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + if (!arch_validate_mmap_prot(prot, addr)) + return -EACCES; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; From 50e3ed0f93f4f62ed2aa83de5db6cb84ecdd5707 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 13:29:29 +0100 Subject: [PATCH 076/134] arm64: mm: add support for WXN memory translation attribute The AArch64 virtual memory system supports a global WXN control, which can be enabled to make all writable mappings implicitly no-exec. This is a useful hardening feature, as it prevents mistakes in managing page table permissions from being exploited to attack the system. 
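From the point of view of user space, the effect is that mappings that are writable and executable at the same time can no longer be created (a sketch; the rejection itself is implemented by the mman.h hooks added below):

| /* with WXN enabled, this mmap() returns MAP_FAILED with errno == EACCES */
| void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
|                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);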
When enabled at EL1, the restrictions apply to both EL1 and EL0. EL1 is completely under our control, and has been cleaned up to allow WXN to be enabled from boot onwards. EL0 is not under our control, but given that widely deployed security features such as selinux or PaX already limit the ability of user space to create mappings that are writable and executable at the same time, the impact of enabling this for EL0 is expected to be limited. (For this reason, common user space libraries that have a legitimate need for manipulating executable code already carry fallbacks such as [0].) If enabled at compile time, the feature can still be disabled at boot if needed, by passing arm64.nowxn on the kernel command line. [0] https://github.com/libffi/libffi/blob/master/src/closures.c#L440 Signed-off-by: Ard Biesheuvel Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240214122845.2033971-88-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 11 ++++++++ arch/arm64/include/asm/cpufeature.h | 8 ++++++ arch/arm64/include/asm/mman.h | 36 +++++++++++++++++++++++++++ arch/arm64/include/asm/mmu_context.h | 30 +++++++++++++++++++++- arch/arm64/kernel/pi/idreg-override.c | 4 ++- arch/arm64/kernel/pi/map_kernel.c | 23 +++++++++++++++++ arch/arm64/mm/proc.S | 6 +++++ 7 files changed, 116 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 160856de9bbb..7761ffc6dbcf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1608,6 +1608,17 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases. +config ARM64_WXN + bool "Enable WXN attribute so all writable mappings are non-exec" + help + Set the WXN bit in the SCTLR system register so that all writable + mappings are treated as if the PXN/UXN bit is set as well. + If this is set to Y, it can still be disabled at runtime by + passing 'arm64.nowxn' on the kernel command line. + + This should only be set if no software needs to be supported that + relies on being able to execute from writable mappings. 
+ config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" help diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a8f97690ce1f..ee33b7e52da7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -18,6 +18,7 @@ #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 #define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8 +#define ARM64_SW_FEATURE_OVERRIDE_NOWXN 12 #ifndef __ASSEMBLY__ @@ -962,6 +963,13 @@ static inline bool kaslr_disabled_cmdline(void) return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR); } +static inline bool arm64_wxn_enabled(void) +{ + if (!IS_ENABLED(CONFIG_ARM64_WXN)) + return false; + return !arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN); +} + u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 5966ee4a6154..6d4940342ba7 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -35,11 +35,40 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) } #define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags) +static inline bool arm64_check_wx_prot(unsigned long prot, + struct task_struct *tsk) +{ + /* + * When we are running with SCTLR_ELx.WXN==1, writable mappings are + * implicitly non-executable. This means we should reject such mappings + * when user space attempts to create them using mmap() or mprotect(). + */ + if (arm64_wxn_enabled() && + ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))) { + /* + * User space libraries such as libffi carry elaborate + * heuristics to decide whether it is worth it to even attempt + * to create writable executable mappings, as PaX or selinux + * enabled systems will outright reject it. They will usually + * fall back to something else (e.g., two separate shared + * mmap()s of a temporary file) on failure. 
+ */ + pr_info_ratelimited( + "process %s (%d) attempted to create PROT_WRITE+PROT_EXEC mapping\n", + tsk->comm, tsk->pid); + return false; + } + return true; +} + static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) { unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; + if (!arm64_check_wx_prot(prot, current)) + return false; + if (system_supports_bti()) supported |= PROT_BTI; @@ -50,6 +79,13 @@ static inline bool arch_validate_prot(unsigned long prot, } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) +static inline bool arch_validate_mmap_prot(unsigned long prot, + unsigned long addr) +{ + return arm64_check_wx_prot(prot, current); +} +#define arch_validate_mmap_prot arch_validate_mmap_prot + static inline bool arch_validate_flags(unsigned long vm_flags) { if (!system_supports_mte()) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index c768d16b81a4..f0fe2d09d139 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,13 +20,41 @@ #include #include #include -#include #include #include #include extern bool rodata_full; +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + return 0; +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + +static inline void arch_unmap(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + bool write, bool execute, bool foreign) +{ + if (IS_ENABLED(CONFIG_ARM64_WXN) && execute && + (vma->vm_flags & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC)) { + pr_warn_ratelimited( + "process %s (%d) attempted to execute from writable memory\n", + current->comm, current->pid); + /* disallow unless the nowxn override is set */ + return !arm64_wxn_enabled(); + } + return true; +} + static inline void contextidr_thread_switch(struct task_struct *next) { if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index aad399796e81..bccfee34f62f 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -189,6 +189,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), + FIELD("nowxn", ARM64_SW_FEATURE_OVERRIDE_NOWXN, NULL), {} }, }; @@ -221,8 +222,9 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, - { "rodata=off", "arm64_sw.rodataoff=1" }, + { "rodata=off", "arm64_sw.rodataoff=1 arm64_sw.nowxn=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.nowxn", "arm64_sw.nowxn=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index 5fa08e13e17e..cac1e1f63c44 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -132,6 +132,25 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) idmap_cpu_replace_ttbr1(swapper_pg_dir); } +static void noinline __section(".idmap.text") disable_wxn(void) +{ + u64 sctlr = read_sysreg(sctlr_el1) & ~SCTLR_ELx_WXN; + + /* + * We cannot safely clear the WXN bit while the MMU and 
caches are on, + * so turn the MMU off, flush the TLBs and turn it on again but with + * the WXN bit cleared this time. + */ + asm(" msr sctlr_el1, %0 ;" + " isb ;" + " tlbi vmalle1 ;" + " dsb nsh ;" + " isb ;" + " msr sctlr_el1, %1 ;" + " isb ;" + :: "r"(sctlr & ~SCTLR_ELx_M), "r"(sctlr)); +} + static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) { u64 sctlr = read_sysreg(sctlr_el1); @@ -229,6 +248,10 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) if (va_bits > VA_BITS_MIN) sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); + if (IS_ENABLED(CONFIG_ARM64_WXN) && + arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN)) + disable_wxn(); + /* * The virtual KASLR displacement modulo 2MiB is decided by the * physical placement of the image, as otherwise, we might not be able diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9d40f3ffd8d2..bfd2ad896108 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -546,6 +546,12 @@ alternative_else_nop_endif * Prepare SCTLR */ mov_q x0, INIT_SCTLR_EL1_MMU_ON +#ifdef CONFIG_ARM64_WXN + ldr_l x1, arm64_sw_feature_override + FTR_OVR_VAL_OFFSET + tst x1, #0xf << ARM64_SW_FEATURE_OVERRIDE_NOWXN + orr x1, x0, #SCTLR_ELx_WXN + csel x0, x0, x1, ne +#endif ret // return to head.S .unreq mair From 90e636f60b76c590aded72964543945084d97c2f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 17 Feb 2024 00:59:44 +0100 Subject: [PATCH 077/134] arm64: mm: Make PUD folding check in set_pud() a runtime check When set_pud() is called on a 4-level paging build config that runs with 3 levels at runtime (which happens with 16k page size builds with support for LPA2), the updated entry is in fact a PGD in swapper_pg_dir[], and this is mapped read-only after boot. So in this case, the existing check needs to be performed as well, even though __PAGETABLE_PUD_FOLDED is not #define'd. So replace the #ifdef with a call to pgtable_l4_enabled(). Cc: Will Deacon Cc: Marc Zyngier Cc: Mark Rutland Cc: Ryan Roberts Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240216235944.3677178-2-ardb+git@google.com Reviewed-by: Itaru Kitayama Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b3c716fa8121..8bec85350865 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -699,14 +699,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_user(pud) pte_user(pud_pte(pud)) #define pud_user_exec(pud) pte_user_exec(pud_pte(pud)) +static inline bool pgtable_l4_enabled(void); + static inline void set_pud(pud_t *pudp, pud_t pud) { -#ifdef __PAGETABLE_PUD_FOLDED - if (in_swapper_pgdir(pudp)) { + if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) { set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); return; } -#endif /* __PAGETABLE_PUD_FOLDED */ WRITE_ONCE(*pudp, pud); From 2aea7b77aabc708a9df769ad5fa63e9912ceb7f7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Feb 2024 15:13:22 +0000 Subject: [PATCH 078/134] arm64: Use Signed/Unsigned enums for TGRAN{4,16,64} and VARange Open-coding the feature matching parameters for LVA/LVA2 leads to issues with upcoming changes to the cpufeature code. By making TGRAN{4,16,64} and VARange signed/unsigned as per the architecture, we can use the existing macros, making the feature match robust against those changes. 
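To see why the signedness matters, take TGRAN4, which the architecture defines as a signed field: 0b0001 denotes 52_BIT support and 0b1111 denotes NI (no 4k granule at all):

| /* matching min_field_value == 52_BIT (0b0001) against a raw field of 0b1111: */
| /* unsigned compare: 15 >= 1 -> NI hardware would wrongly match as LPA2 capable */
| /* signed compare: -1 < 1 -> NI hardware correctly fails the match */

With the fields described as SignedEnum/UnsignedEnum, the generated definitions carry this information, and ARM64_CPUID_FIELDS() picks the right comparison automatically.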
Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Acked-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 15 +++------------ arch/arm64/tools/sysreg | 8 ++++---- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0be9296e9253..d380ae783b73 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2706,24 +2706,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_VA52, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, - .field_width = 4, #ifdef CONFIG_ARM64_64K_PAGES .desc = "52-bit Virtual Addressing (LVA)", - .sign = FTR_SIGNED, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_EL1_VARange_SHIFT, - .min_field_value = ID_AA64MMFR2_EL1_VARange_52, + ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52) #else .desc = "52-bit Virtual Addressing (LPA2)", - .sys_reg = SYS_ID_AA64MMFR0_EL1, #ifdef CONFIG_ARM64_4K_PAGES - .sign = FTR_SIGNED, - .field_pos = ID_AA64MMFR0_EL1_TGRAN4_SHIFT, - .min_field_value = ID_AA64MMFR0_EL1_TGRAN4_52_BIT, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT) #else - .sign = FTR_UNSIGNED, - .field_pos = ID_AA64MMFR0_EL1_TGRAN16_SHIFT, - .min_field_value = ID_AA64MMFR0_EL1_TGRAN16_52_BIT, + ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT) #endif #endif }, diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 4c9b67934367..f654e82ef072 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1540,16 +1540,16 @@ Enum 35:32 TGRAN16_2 0b0010 IMP 0b0011 52_BIT EndEnum -Enum 31:28 TGRAN4 +SignedEnum 31:28 TGRAN4 0b0000 IMP 0b0001 52_BIT 0b1111 NI EndEnum -Enum 27:24 TGRAN64 +SignedEnum 27:24 TGRAN64 0b0000 IMP 0b1111 NI EndEnum -Enum 23:20 TGRAN16 +UnsignedEnum 23:20 TGRAN16 0b0000 NI 0b0001 IMP 0b0010 52_BIT @@ -1697,7 +1697,7 @@ Enum 23:20 CCIDX 0b0000 32 0b0001 64 EndEnum -Enum 19:16 VARange +UnsignedEnum 19:16 VARange 0b0000 48 0b0001 52 EndEnum From 8c10cc104b73abdfd87a23ae50b1c90c2c917027 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 9 Feb 2024 18:39:16 +0000 Subject: [PATCH 079/134] arm64: errata: Don't enable workarounds for "rare" errata by default Arm classifies some of its CPU errata as "rare", indicating that the hardware error is unlikely to occur in practice. Given that the cost of errata workarounds can often be significant in terms of power and performance, don't enable workarounds for "rare" errata by default and update our documentation to reflect that. Cc: James Morse Signed-off-by: Will Deacon Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240209183916.25860-1-will@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/silicon-errata.rst | 5 +++-- arch/arm64/Kconfig | 25 +++++++++------------ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index e8c2ce1f9df6..fa16b895c997 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -35,8 +35,9 @@ can be triggered by Linux). For software workarounds that may adversely impact systems unaffected by the erratum in question, a Kconfig entry is added under "Kernel Features" -> "ARM errata workarounds via the alternatives framework". -These are enabled by default and patched in at runtime when an affected -CPU is detected. 
For less-intrusive workarounds, a Kconfig option is not +With the exception of workarounds for errata deemed "rare" by Arm, these +are enabled by default and patched in at runtime when an affected CPU is +detected. For less-intrusive workarounds, a Kconfig option is not available and the code is structured (preferably with a comment) in such a way that the erratum will not be hit. diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d435139..430fabf20f17 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -547,9 +547,8 @@ config ARM64_ERRATUM_832075 If unsure, say Y. config ARM64_ERRATUM_834220 - bool "Cortex-A57: 834220: Stage 2 translation fault might be incorrectly reported in presence of a Stage 1 fault" + bool "Cortex-A57: 834220: Stage 2 translation fault might be incorrectly reported in presence of a Stage 1 fault (rare)" depends on KVM - default y help This option adds an alternative code sequence to work around ARM erratum 834220 on Cortex-A57 parts up to r1p2. @@ -565,7 +564,7 @@ config ARM64_ERRATUM_834220 as it depends on the alternative framework, which will only patch the kernel if an affected CPU is detected. - If unsure, say Y. + If unsure, say N. config ARM64_ERRATUM_1742098 bool "Cortex-A57/A72: 1742098: ELR recorded incorrectly on interrupt taken between cryptographic instructions in a sequence" @@ -692,8 +691,7 @@ config ARM64_WORKAROUND_REPEAT_TLBI bool config ARM64_ERRATUM_2441007 - bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI" - default y + bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI (rare)" select ARM64_WORKAROUND_REPEAT_TLBI help This option adds a workaround for ARM Cortex-A55 erratum #2441007. @@ -706,11 +704,10 @@ config ARM64_ERRATUM_2441007 Work around this by adding the affected CPUs to the list that needs TLB sequences to be done twice. - If unsure, say Y. + If unsure, say N. config ARM64_ERRATUM_1286807 - bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation" - default y + bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation (rare)" select ARM64_WORKAROUND_REPEAT_TLBI help This option adds a workaround for ARM Cortex-A76 erratum 1286807. @@ -724,6 +721,8 @@ config ARM64_ERRATUM_1286807 invalidated has been observed by other observers. The workaround repeats the TLBI+DSB operation. + If unsure, say N. + config ARM64_ERRATUM_1463225 bool "Cortex-A76: Software Step might prevent interrupt recognition" default y @@ -743,8 +742,7 @@ config ARM64_ERRATUM_1463225 If unsure, say Y. config ARM64_ERRATUM_1542419 - bool "Neoverse-N1: workaround mis-ordering of instruction fetches" - default y + bool "Neoverse-N1: workaround mis-ordering of instruction fetches (rare)" help This option adds a workaround for ARM Neoverse-N1 erratum 1542419. @@ -756,7 +754,7 @@ config ARM64_ERRATUM_1542419 Workaround the issue by hiding the DIC feature from EL0. This forces user-space to perform cache maintenance. - If unsure, say Y. + If unsure, say N. config ARM64_ERRATUM_1508412 bool "Cortex-A77: 1508412: workaround deadlock on sequence of NC/Device load and store exclusive or PAR read" @@ -931,8 +929,7 @@ config ARM64_ERRATUM_2224489 If unsure, say Y. 
config ARM64_ERRATUM_2441009 - bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI" - default y + bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI (rare)" select ARM64_WORKAROUND_REPEAT_TLBI help This option adds a workaround for ARM Cortex-A510 erratum #2441009. @@ -945,7 +942,7 @@ Work around this by adding the affected CPUs to the list that needs TLB sequences to be done twice. - If unsure, say Y. + If unsure, say N. config ARM64_ERRATUM_2064142 bool "Cortex-A510: 2064142: workaround TRBE register writes while disabled" From d044d6ba6f028e32a4ecd6b681c6aa0c48ddee18 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 24 Jan 2024 11:12:59 +0000 Subject: [PATCH 080/134] arm64: io: permit offset addressing Currently our IO accessors all use register addressing without offsets, but we could safely use offset addressing (without writeback) to simplify and optimize the generated code. To function correctly under a hypervisor which emulates IO accesses, we must ensure that any faulting/trapped IO access results in an ESR_ELx value with ESR_ELx.ISS.ISV=1 and with the transfer register described in ESR_ELx.ISS.SRT. This means that we can only use loads/stores of a single general purpose register (or the zero register), and must avoid writeback addressing modes. However, we can use immediate offset addressing modes, as these still provide ESR_ELx.ISS.ISV=1 and a valid ESR_ELx.ISS.SRT when those accesses fault at Stage-2. Currently we only use register addressing without offsets. We use the "r" constraint to place the address into a register, and manually generate the register addressing by surrounding the resulting register operand with square brackets, e.g. | static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) | { | asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); | } Due to this, sequences of adjacent accesses need to generate addresses using separate instructions. For example, the following code: | void writeq_zero_8_times(void *ptr) | { | writeq_relaxed(0, ptr + 8 * 0); | writeq_relaxed(0, ptr + 8 * 1); | writeq_relaxed(0, ptr + 8 * 2); | writeq_relaxed(0, ptr + 8 * 3); | writeq_relaxed(0, ptr + 8 * 4); | writeq_relaxed(0, ptr + 8 * 5); | writeq_relaxed(0, ptr + 8 * 6); | writeq_relaxed(0, ptr + 8 * 7); | } ... is compiled to: | : | str xzr, [x0] | add x1, x0, #0x8 | str xzr, [x1] | add x1, x0, #0x10 | str xzr, [x1] | add x1, x0, #0x18 | str xzr, [x1] | add x1, x0, #0x20 | str xzr, [x1] | add x1, x0, #0x28 | str xzr, [x1] | add x1, x0, #0x30 | str xzr, [x1] | add x0, x0, #0x38 | str xzr, [x0] | ret As described above, we could safely use immediate offset addressing, which would allow the ADDs to be folded into the address generation for the STRs, resulting in simpler and smaller generated assembly. We can do this by using the "o" constraint to allow the compiler to generate offset addressing (without writeback) for a memory operand, e.g. | static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) | { | volatile u64 __iomem *ptr = addr; | asm volatile("str %x0, %1" : : "rZ" (val), "o" (*ptr)); | } ...
which results in the earlier code sequence being compiled to: | : | str xzr, [x0] | str xzr, [x0, #8] | str xzr, [x0, #16] | str xzr, [x0, #24] | str xzr, [x0, #32] | str xzr, [x0, #40] | str xzr, [x0, #48] | str xzr, [x0, #56] | ret As Will notes at: https://lore.kernel.org/linux-arm-kernel/20240117160528.GA3398@willie-the-truck/ ... some compilers struggle with a plain "o" constraint, so it's preferable to use "Qo", where the additional "Q" constraint permits using non-offset register addressing. This patch modifies our IO write accessors to use "Qo" constraints, resulting in the better code generation described above. The IO read accessors are left as-is because ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE requires that non-offset register addressing is used, as the LDAR instruction does not support offset addressing. When compiling v6.8-rc1 defconfig with GCC 13.2.0, this saves ~4KiB of text: | [mark@lakrids:~/src/linux]% ls -al vmlinux-* | -rwxr-xr-x 1 mark mark 153960576 Jan 23 12:01 vmlinux-after | -rwxr-xr-x 1 mark mark 153862192 Jan 23 11:57 vmlinux-before | | [mark@lakrids:~/src/linux]% size vmlinux-before vmlinux-after | text data bss dec hex filename | 26708921 16690350 622736 44022007 29fb8f7 vmlinux-before | 26704761 16690414 622736 44017911 29fa8f7 vmlinux-after ... though due to internal alignment of sections, this has no impact on the size of the resulting Image: | [mark@lakrids:~/src/linux]% ls -al Image-* | -rw-r--r-- 1 mark mark 43590144 Jan 23 12:01 Image-after | -rw-r--r-- 1 mark mark 43590144 Jan 23 11:57 Image-before Aside from the better code generation, there should be no functional change as a result of this patch. I have lightly tested this patch, including booting under KVM (where some devices such as PL011 are emulated). Signed-off-by: Mark Rutland Cc: Jason Gunthorpe Cc: Marc Zyngier Cc: Will Deacon Reviewed-by: Jason Gunthorpe Acked-by: Will Deacon Link: https://lore.kernel.org/r/20240124111259.874975-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/io.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 3b694511b98f..8d825522c55c 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -24,25 +24,29 @@ #define __raw_writeb __raw_writeb static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr) { - asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr)); + volatile u8 __iomem *ptr = addr; + asm volatile("strb %w0, %1" : : "rZ" (val), "Qo" (*ptr)); } #define __raw_writew __raw_writew static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr) { - asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr)); + volatile u16 __iomem *ptr = addr; + asm volatile("strh %w0, %1" : : "rZ" (val), "Qo" (*ptr)); } #define __raw_writel __raw_writel static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr) { - asm volatile("str %w0, [%1]" : : "rZ" (val), "r" (addr)); + volatile u32 __iomem *ptr = addr; + asm volatile("str %w0, %1" : : "rZ" (val), "Qo" (*ptr)); } #define __raw_writeq __raw_writeq static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr) { - asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr)); + volatile u64 __iomem *ptr = addr; + asm volatile("str %x0, %1" : : "rZ" (val), "Qo" (*ptr)); } #define __raw_readb __raw_readb From 270de609ae2af441d15289406340ff209e5dc864 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Feb 2024 12:38:46 
+0000 Subject: [PATCH 081/134] arm64: Simplify do_notify_resume() DAIF masking In do_notify_resume, we handle _TIF_NEED_RESCHED differently from all other flags, leaving IRQ+FIQ masked when calling into schedule(). This masking is a historical artifact, and it is not currently necessary to mask IRQ+FIQ when calling into schedule (as evidenced by the generic exit_to_user_mode_loop(), which unmasks IRQs before checking _TIF_NEED_RESCHED and calling schedule()). This patch removes the special case for _TIF_NEED_RESCHED, moving this check into the main loop such that schedule() will be called from a regular process context with IRQ+FIQ unmasked. This is a minor simplification to do_notify_resume() and brings it into line with the generic exit_to_user_mode_loop() logic. This will also aid subsequent rework of DAIF management. Signed-off-by: Mark Rutland Cc: James Morse Cc: Mark Brown Cc: Will Deacon Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240206123848.1696480-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas Tested-by: Itaru Kitayama --- arch/arm64/kernel/signal.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 0e8beb3349ea..50e108741599 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1281,33 +1281,29 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { - /* Unmask Debug and SError for the next task */ - local_daif_restore(DAIF_PROCCTX_NOIRQ); + local_daif_restore(DAIF_PROCCTX); + if (thread_flags & _TIF_NEED_RESCHED) schedule(); - } else { - local_daif_restore(DAIF_PROCCTX); - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); } + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + + if (thread_flags & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); + local_daif_mask(); thread_flags = read_thread_flags(); } while (thread_flags & _TIF_WORK_MASK); From 997d79eb938e981ab0d3714d39ed148bce131d9e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Feb 2024 12:38:47 +0000 Subject: [PATCH 082/134] arm64: Move do_notify_resume() to entry-common.c Currently do_notify_resume() lives in arch/arm64/kernel/signal.c, but it would make more sense for it to live in entry-common.c as it handles more than signals, and is coupled with the rest of the return-to-userspace sequence (e.g. with unusual DAIF masking that matches the exception return requirements). Move do_notify_resume() to entry-common.c. There should be no functional change as a result of this patch. 
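For orientation, after this move the return-to-userspace preparation in entry-common.c has roughly the following shape (a condensed sketch of the code as it stands after this patch; the verbatim version appears in the diffs below):

| static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
| {
| 	unsigned long flags;
|
| 	local_daif_mask();
|
| 	flags = read_thread_flags();
| 	if (unlikely(flags & _TIF_WORK_MASK))
| 		do_notify_resume(regs, flags);	/* now static in this file */
|
| 	lockdep_sys_exit();
| }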
Signed-off-by: Mark Rutland Cc: James Morse Cc: Mark Brown Cc: Will Deacon Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240206123848.1696480-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas Tested-by: Itaru Kitayama --- arch/arm64/include/asm/exception.h | 2 +- arch/arm64/kernel/entry-common.c | 32 +++++++++++++++++++++++++++ arch/arm64/kernel/signal.c | 35 ++---------------------------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index ad688e157c9b..f296662590c7 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -74,7 +74,7 @@ void do_el0_fpac(struct pt_regs *regs, unsigned long esr); void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); -void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags); +void do_signal(struct pt_regs *regs); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 0fc94207e69a..3c849ad03bf8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,37 @@ static __always_inline void __exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) +{ + do { + local_daif_restore(DAIF_PROCCTX); + + if (thread_flags & _TIF_NEED_RESCHED) + schedule(); + + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); + } + + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + do_signal(regs); + + if (thread_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(regs); + + if (thread_flags & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); + + local_daif_mask(); + thread_flags = read_thread_flags(); + } while (thread_flags & _TIF_WORK_MASK); +} + static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long flags; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 50e108741599..c08e6465e0f4 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include @@ -1207,7 +1207,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. 
*/ -static void do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; @@ -1278,37 +1278,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) -{ - do { - local_daif_restore(DAIF_PROCCTX); - - if (thread_flags & _TIF_NEED_RESCHED) - schedule(); - - if (thread_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (thread_flags & _TIF_MTE_ASYNC_FAULT) { - clear_thread_flag(TIF_MTE_ASYNC_FAULT); - send_sig_fault(SIGSEGV, SEGV_MTEAERR, - (void __user *)NULL, current); - } - - if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - - if (thread_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - - if (thread_flags & _TIF_FOREIGN_FPSTATE) - fpsimd_restore_current_state(); - - local_daif_mask(); - thread_flags = read_thread_flags(); - } while (thread_flags & _TIF_WORK_MASK); -} - unsigned long __ro_after_init signal_minsigstksz; /* From 97d935faacde478501eea6f75c86beea71f29ba3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 6 Feb 2024 12:38:48 +0000 Subject: [PATCH 083/134] arm64: Unmask Debug + SError in do_notify_resume() When returning to a user context, the arm64 entry code masks all DAIF exceptions before handling pending work in exit_to_user_mode_prepare() and do_notify_resume(), where it will transiently unmask all DAIF exceptions. This is a holdover from the old entry assembly, which conservatively masked all DAIF exceptions, and it's only necessary to mask interrupts at this point during the exception return path, so long as we subsequently mask all DAIF exceptions before the actual exception return. While most DAIF manipulation follows a save...restore sequence, the manipulation in do_notify_resume() is the other way around, unmasking all DAIF exceptions before masking them again. This is unfortunate as we unnecessarily mask Debug and SError exceptions, and it would be nice to remove this special case to make DAIF manipulation simpler and more consistent. This patch changes exit_to_user_mode_prepare() and do_notify_resume() to only mask interrupts while handling pending work, masking other DAIF exceptions after this has completed. This removes the unusual DAIF manipulation and allows Debug and SError exceptions to be taken for a slightly longer window during the exception return path.
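In terms of the masking sequence, the change amounts to the following before/after sketch (condensed from the diff below; DAIF covers Debug, SError, IRQ and FIQ):

| /* before: all DAIF exceptions are toggled around each iteration */
| do {
| 	local_daif_restore(DAIF_PROCCTX);	/* unmask D, A, I, F */
| 	/* ... handle pending work ... */
| 	local_daif_mask();			/* mask D, A, I, F */
| } while (thread_flags & _TIF_WORK_MASK);
|
| /* after: only IRQ+FIQ are toggled; D and A stay unmasked until the end */
| do {
| 	local_irq_enable();
| 	/* ... handle pending work ... */
| 	local_irq_disable();
| } while (thread_flags & _TIF_WORK_MASK);
| local_daif_mask();	/* mask everything only before the actual return */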
Signed-off-by: Mark Rutland Cc: James Morse Cc: Mark Brown Cc: Will Deacon Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240206123848.1696480-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas Tested-by: Itaru Kitayama --- arch/arm64/kernel/entry-common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3c849ad03bf8..b77a15955f28 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -130,7 +130,7 @@ static __always_inline void __exit_to_user_mode(void) static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - local_daif_restore(DAIF_PROCCTX); + local_irq_enable(); if (thread_flags & _TIF_NEED_RESCHED) schedule(); @@ -153,7 +153,7 @@ static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); - local_daif_mask(); + local_irq_disable(); thread_flags = read_thread_flags(); } while (thread_flags & _TIF_WORK_MASK); } @@ -162,12 +162,14 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long flags; - local_daif_mask(); + local_irq_disable(); flags = read_thread_flags(); if (unlikely(flags & _TIF_WORK_MASK)) do_notify_resume(regs, flags); + local_daif_mask(); + lockdep_sys_exit(); } From 253751233b19b58b1d361388c8d2b40c940e729c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 8 Feb 2024 14:59:16 +0000 Subject: [PATCH 084/134] arm64: kretprobes: acquire the regs via a BRK exception On arm64, kprobes always take an exception and so create a struct pt_regs through the usual exception entry logic. Similarly, kretprobes takes an exception for function entry, but for function returns it uses a trampoline which attempts to create a struct pt_regs without taking an exception. This is problematic for a few reasons, including: 1) The kretprobes trampoline neither saves nor restores all of the portions of PSTATE. Before invoking the handler it saves a number of portions of PSTATE, and after returning from the handler it restores NZCV before returning to the original return address provided by the handler. 2) The kretprobe trampoline constructs the PSTATE value piecemeal from special purpose registers as it cannot read all of PSTATE atomically without taking an exception. This is somewhat fragile, and it's not possible to reliably recover PSTATE information which only exists on some physical CPUs (e.g. when SSBS support is mismatched). Today the kretprobes trampoline does not record: - BTYPE - SSBS - ALLINT - SS - PAN - UAO - DIT - TCO ... and this will only get worse with future architecture extensions which add more PSTATE bits. 3) The kretprobes trampoline doesn't store portions of struct pt_regs (e.g. the PMR value when using pseudo-NMIs). Due to this, helpers which operate on a struct pt_regs, such as interrupts_enabled(), may not work correctly. 4) The function entry and function exit handlers run in different contexts. The entry handler will always be run in a debug exception context (which is currently treated as an NMI), but the return will be treated as whatever context the instrumented function was executed in. The differences between these contexts are liable to cause problems (e.g. as the two can be differently interruptible or preemptible, adversely affecting synchronization between the handlers).
5) As the kretprobes trampoline runs in the same context as the code being probed, it is subject to the same single-stepping context, which may not be desirable if this is being driven by the kprobes handlers. Overall, this is fragile, painful to maintain, and gets in the way of supporting other things (e.g. RELIABLE_STACKTRACE, FEAT_NMI). This patch addresses these issues by replacing the kretprobes trampoline with a `BRK` instruction, and using an exception boundary to acquire and restore the regs, in the same way as the regular kprobes trampoline. I've tested this atop v6.8-rc3: | KTAP version 1 | 1..1 | KTAP version 1 | # Subtest: kprobes_test | # module: test_kprobes | 1..7 | ok 1 test_kprobe | ok 2 test_kprobes | ok 3 test_kprobe_missed | ok 4 test_kretprobe | ok 5 test_kretprobes | ok 6 test_stacktrace_on_kretprobe | ok 7 test_stacktrace_on_nested_kretprobe | # kprobes_test: pass:7 fail:0 skip:0 total:7 | # Totals: pass:7 fail:0 skip:0 total:7 | ok 1 kprobes_test Signed-off-by: Mark Rutland Cc: Will Deacon Cc: Florent Revest Cc: Masami Hiramatsu Cc: Steven Rostedt Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20240208145916.2004154-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/brk-imm.h | 2 + arch/arm64/kernel/probes/kprobes.c | 21 +++-- arch/arm64/kernel/probes/kprobes_trampoline.S | 78 ++----------------- 3 files changed, 24 insertions(+), 77 deletions(-) diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h index 1abdcd508a11..beb42c62b6ac 100644 --- a/arch/arm64/include/asm/brk-imm.h +++ b/arch/arm64/include/asm/brk-imm.h @@ -11,6 +11,7 @@ * 0x004: for installing kprobes * 0x005: for installing uprobes * 0x006: for kprobe software single-step + * 0x007: for kretprobe return * Allowed values for kgdb are 0x400 - 0x7ff * 0x100: for triggering a fault on purpose (reserved) * 0x400: for dynamic BRK instruction @@ -23,6 +24,7 @@ #define KPROBES_BRK_IMM 0x004 #define UPROBES_BRK_IMM 0x005 #define KPROBES_BRK_SS_IMM 0x006 +#define KRETPROBES_BRK_IMM 0x007 #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 70b91a8c6bb3..327855a11df2 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -371,6 +371,21 @@ static struct break_hook kprobes_break_ss_hook = { .fn = kprobe_breakpoint_ss_handler, }; +static int __kprobes +kretprobe_breakpoint_handler(struct pt_regs *regs, unsigned long esr) +{ + if (regs->pc != (unsigned long)__kretprobe_trampoline) + return DBG_HOOK_ERROR; + + regs->pc = kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); + return DBG_HOOK_HANDLED; +} + +static struct break_hook kretprobes_break_hook = { + .imm = KRETPROBES_BRK_IMM, + .fn = kretprobe_breakpoint_handler, +}; + /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. * This blacklist is exposed to userspace via debugfs (kprobes/blacklist).
@@ -396,11 +411,6 @@ int __init arch_populate_kprobe_blacklist(void) return ret; } -void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) -{ - return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -420,6 +430,7 @@ int __init arch_init_kprobes(void) { register_kernel_break_hook(&kprobes_break_hook); register_kernel_break_hook(&kprobes_break_ss_hook); + register_kernel_break_hook(&kretprobes_break_hook); return 0; } diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 9a6499bed58b..a362f3dbb3d1 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -4,83 +4,17 @@ */ #include -#include +#include #include .text - .macro save_all_base_regs - stp x0, x1, [sp, #S_X0] - stp x2, x3, [sp, #S_X2] - stp x4, x5, [sp, #S_X4] - stp x6, x7, [sp, #S_X6] - stp x8, x9, [sp, #S_X8] - stp x10, x11, [sp, #S_X10] - stp x12, x13, [sp, #S_X12] - stp x14, x15, [sp, #S_X14] - stp x16, x17, [sp, #S_X16] - stp x18, x19, [sp, #S_X18] - stp x20, x21, [sp, #S_X20] - stp x22, x23, [sp, #S_X22] - stp x24, x25, [sp, #S_X24] - stp x26, x27, [sp, #S_X26] - stp x28, x29, [sp, #S_X28] - add x0, sp, #PT_REGS_SIZE - stp lr, x0, [sp, #S_LR] - /* - * Construct a useful saved PSTATE - */ - mrs x0, nzcv - mrs x1, daif - orr x0, x0, x1 - mrs x1, CurrentEL - orr x0, x0, x1 - mrs x1, SPSel - orr x0, x0, x1 - stp xzr, x0, [sp, #S_PC] - .endm - - .macro restore_all_base_regs - ldr x0, [sp, #S_PSTATE] - and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT) - msr nzcv, x0 - ldp x0, x1, [sp, #S_X0] - ldp x2, x3, [sp, #S_X2] - ldp x4, x5, [sp, #S_X4] - ldp x6, x7, [sp, #S_X6] - ldp x8, x9, [sp, #S_X8] - ldp x10, x11, [sp, #S_X10] - ldp x12, x13, [sp, #S_X12] - ldp x14, x15, [sp, #S_X14] - ldp x16, x17, [sp, #S_X16] - ldp x18, x19, [sp, #S_X18] - ldp x20, x21, [sp, #S_X20] - ldp x22, x23, [sp, #S_X22] - ldp x24, x25, [sp, #S_X24] - ldp x26, x27, [sp, #S_X26] - ldp x28, x29, [sp, #S_X28] - .endm - SYM_CODE_START(__kretprobe_trampoline) - sub sp, sp, #PT_REGS_SIZE - - save_all_base_regs - - /* Setup a frame pointer. */ - add x29, sp, #S_FP - - mov x0, sp - bl trampoline_probe_handler /* - * Replace trampoline address in lr with actual orig_ret_addr return - * address. + * Trigger a breakpoint exception. The PC will be adjusted by + * kretprobe_breakpoint_handler(), and no subsequent instructions will + * be executed from the trampoline. */ - mov lr, x0 - - /* The frame pointer (x29) is restored with other registers. */ - restore_all_base_regs - - add sp, sp, #PT_REGS_SIZE - ret - + brk #KRETPROBES_BRK_IMM + ASM_BUG() SYM_CODE_END(__kretprobe_trampoline) From c745b15c1f9cea5680c2906ae868302108f8daf0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 22 Jan 2024 21:05:03 +0000 Subject: [PATCH 085/134] kselftest/arm64: Test that ptrace takes effect in the target process While we have test coverage for the ptrace interface in our selftests, the current programs have a number of gaps. The testing is done per regset, so it does not cover interactions, and at no point do any of the tests actually run the traced processes, meaning that there is no validation that anything we read or write corresponds to register values the process actually sees. Let's add a new program which attempts to cover these gaps. Each test we do performs a single ptrace write.
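For reference, "a single ptrace write" here means one PTRACE_SETREGSET call targeting one register set; a minimal sketch of the FPSIMD case, mirroring the fpsimd_write() helper added below, looks like:

| struct user_fpsimd_state fpsimd;
| struct iovec iov;
| int ret;
|
| memset(&fpsimd, 0, sizeof(fpsimd));
| /* ... fill fpsimd.vregs with the values to be written ... */
| iov.iov_base = &fpsimd;
| iov.iov_len = sizeof(fpsimd);
| ret = ptrace(PTRACE_SETREGSET, child, NT_PRFPREG, &iov);
| if (ret == -1)
| 	ksft_print_msg("FPSIMD set failed: (%s) %d\n", strerror(errno), errno);

The SVE, SSVE, ZA and ZT writes follow the same pattern with their respective NT_ARM_* regset numbers and payload layouts.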
For each test we generate some random initial register data in memory and then fork() and trace a child. The child will load the generated data into the registers, then trigger a breakpoint. The parent waits for the breakpoint, then reads the entire child register state via ptrace, verifying that the values expected were actually loaded by the child. It then does the write being tested and resumes the child. Once resumed, the child saves the register state it sees to memory and executes another breakpoint. The parent uses process_vm_readv() to get these values from the child and verifies that the values were as expected before cleaning up the child. We generate configurations with combinations of vector lengths and SVCR values and then try every ptrace write which will implement the transition we generated. In order to control execution time (especially in emulation), we only cover the minimum and maximum VL for each of SVE and SME; this will ensure we generate both increasing and decreasing changes in vector length. In order to provide a baseline test, we also check the case where we resume the child without doing a ptrace write. In order to simplify the generation of the test count for kselftest, we will report but skip a substantial number of tests that can't actually be expressed via a single ptrace write, several times more than we actually run. This is noisy and will add some overhead, but is very much simpler, so is probably worth the tradeoff. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240122-arm64-test-ptrace-regs-v1-1-0897f822d73e@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/.gitignore | 1 + tools/testing/selftests/arm64/fp/Makefile | 5 +- .../selftests/arm64/fp/fp-ptrace-asm.S | 279 +++ tools/testing/selftests/arm64/fp/fp-ptrace.c | 1503 +++++++++++++++++ tools/testing/selftests/arm64/fp/fp-ptrace.h | 13 + 5 files changed, 1800 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/fp/fp-ptrace-asm.S create mode 100644 tools/testing/selftests/arm64/fp/fp-ptrace.c create mode 100644 tools/testing/selftests/arm64/fp/fp-ptrace.h diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore index ebc86757bdd8..00e52c966281 100644 --- a/tools/testing/selftests/arm64/fp/.gitignore +++ b/tools/testing/selftests/arm64/fp/.gitignore @@ -1,4 +1,5 @@ fp-pidbench +fp-ptrace fp-stress fpsimd-test rdvl-sme diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile index b413b0af07f9..55d4f00d9e8e 100644 --- a/tools/testing/selftests/arm64/fp/Makefile +++ b/tools/testing/selftests/arm64/fp/Makefile @@ -5,7 +5,9 @@ top_srcdir = $(realpath ../../../../../) CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := fp-stress \ +TEST_GEN_PROGS := \ + fp-ptrace \ + fp-stress \ sve-ptrace sve-probe-vls \ vec-syscfg \ za-fork za-ptrace @@ -24,6 +26,7 @@ EXTRA_CLEAN += $(OUTPUT)/asm-utils.o $(OUTPUT)/rdvl.o $(OUTPUT)/za-fork-asm.o # Build with nolibc to avoid effects due to libc's clone() support $(OUTPUT)/fp-pidbench: fp-pidbench.S $(OUTPUT)/asm-utils.o $(CC) -nostdlib $^ -o $@ +$(OUTPUT)/fp-ptrace: fp-ptrace.c fp-ptrace-asm.S $(OUTPUT)/fpsimd-test: fpsimd-test.S $(OUTPUT)/asm-utils.o $(CC) -nostdlib $^ -o $@ $(OUTPUT)/rdvl-sve: rdvl-sve.c $(OUTPUT)/rdvl.o diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S new file mode 100644 index 000000000000..7ad59d92d02b --- /dev/null +++
b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2021-3 ARM Limited. +// +// Assembly portion of the FP ptrace test + +// +// Load values from memory into registers, break on a breakpoint, then +// break on a further breakpoint +// + +#include "fp-ptrace.h" +#include "sme-inst.h" + +.arch_extension sve + +// Load and save register values with pauses for ptrace +// +// x0 - SVE in use +// x1 - SME in use +// x2 - SME2 in use +// x3 - FA64 supported + +.globl load_and_save +load_and_save: + stp x11, x12, [sp, #-0x10]! + + // This should be redundant in the SVE case + ldr x7, =v_in + ldp q0, q1, [x7] + ldp q2, q3, [x7, #16 * 2] + ldp q4, q5, [x7, #16 * 4] + ldp q6, q7, [x7, #16 * 6] + ldp q8, q9, [x7, #16 * 8] + ldp q10, q11, [x7, #16 * 10] + ldp q12, q13, [x7, #16 * 12] + ldp q14, q15, [x7, #16 * 14] + ldp q16, q17, [x7, #16 * 16] + ldp q18, q19, [x7, #16 * 18] + ldp q20, q21, [x7, #16 * 20] + ldp q22, q23, [x7, #16 * 22] + ldp q24, q25, [x7, #16 * 24] + ldp q26, q27, [x7, #16 * 26] + ldp q28, q29, [x7, #16 * 28] + ldp q30, q31, [x7, #16 * 30] + + // SME? + cbz x1, check_sve_in + + adrp x7, svcr_in + ldr x7, [x7, :lo12:svcr_in] + // SVCR is 0 by default, avoid triggering SME if not in use + cbz x7, check_sve_in + msr S3_3_C4_C2_2, x7 + + // ZA? + tbz x7, #SVCR_ZA_SHIFT, check_sm_in + rdsvl 11, 1 + mov w12, #0 + ldr x6, =za_in +1: _ldr_za 12, 6 + add x6, x6, x11 + add x12, x12, #1 + cmp x11, x12 + bne 1b + + // ZT? + cbz x2, check_sm_in + adrp x6, zt_in + add x6, x6, :lo12:zt_in + _ldr_zt 6 + + // In streaming mode? +check_sm_in: + tbz x7, #SVCR_SM_SHIFT, check_sve_in + mov x4, x3 // Load FFR if we have FA64 + b load_sve + + // SVE? +check_sve_in: + cbz x0, wait_for_writes + mov x4, #1 + +load_sve: + ldr x7, =z_in + ldr z0, [x7, #0, MUL VL] + ldr z1, [x7, #1, MUL VL] + ldr z2, [x7, #2, MUL VL] + ldr z3, [x7, #3, MUL VL] + ldr z4, [x7, #4, MUL VL] + ldr z5, [x7, #5, MUL VL] + ldr z6, [x7, #6, MUL VL] + ldr z7, [x7, #7, MUL VL] + ldr z8, [x7, #8, MUL VL] + ldr z9, [x7, #9, MUL VL] + ldr z10, [x7, #10, MUL VL] + ldr z11, [x7, #11, MUL VL] + ldr z12, [x7, #12, MUL VL] + ldr z13, [x7, #13, MUL VL] + ldr z14, [x7, #14, MUL VL] + ldr z15, [x7, #15, MUL VL] + ldr z16, [x7, #16, MUL VL] + ldr z17, [x7, #17, MUL VL] + ldr z18, [x7, #18, MUL VL] + ldr z19, [x7, #19, MUL VL] + ldr z20, [x7, #20, MUL VL] + ldr z21, [x7, #21, MUL VL] + ldr z22, [x7, #22, MUL VL] + ldr z23, [x7, #23, MUL VL] + ldr z24, [x7, #24, MUL VL] + ldr z25, [x7, #25, MUL VL] + ldr z26, [x7, #26, MUL VL] + ldr z27, [x7, #27, MUL VL] + ldr z28, [x7, #28, MUL VL] + ldr z29, [x7, #29, MUL VL] + ldr z30, [x7, #30, MUL VL] + ldr z31, [x7, #31, MUL VL] + + // FFR is not present in base SME + cbz x4, 1f + ldr x7, =ffr_in + ldr p0, [x7] + ldr x7, [x7, #0] + cbz x7, 1f + wrffr p0.b +1: + + ldr x7, =p_in + ldr p0, [x7, #0, MUL VL] + ldr p1, [x7, #1, MUL VL] + ldr p2, [x7, #2, MUL VL] + ldr p3, [x7, #3, MUL VL] + ldr p4, [x7, #4, MUL VL] + ldr p5, [x7, #5, MUL VL] + ldr p6, [x7, #6, MUL VL] + ldr p7, [x7, #7, MUL VL] + ldr p8, [x7, #8, MUL VL] + ldr p9, [x7, #9, MUL VL] + ldr p10, [x7, #10, MUL VL] + ldr p11, [x7, #11, MUL VL] + ldr p12, [x7, #12, MUL VL] + ldr p13, [x7, #13, MUL VL] + ldr p14, [x7, #14, MUL VL] + ldr p15, [x7, #15, MUL VL] + +wait_for_writes: + // Wait for the parent + brk #0 + + // Save values + ldr x7, =v_out + stp q0, q1, [x7] + stp q2, q3, [x7, #16 * 2] + stp q4, q5, [x7, #16 * 4] + stp q6, q7, [x7, #16 * 6] + stp q8, q9, [x7, #16 * 8] + stp 
q10, q11, [x7, #16 * 10] + stp q12, q13, [x7, #16 * 12] + stp q14, q15, [x7, #16 * 14] + stp q16, q17, [x7, #16 * 16] + stp q18, q19, [x7, #16 * 18] + stp q20, q21, [x7, #16 * 20] + stp q22, q23, [x7, #16 * 22] + stp q24, q25, [x7, #16 * 24] + stp q26, q27, [x7, #16 * 26] + stp q28, q29, [x7, #16 * 28] + stp q30, q31, [x7, #16 * 30] + + // SME? + cbz x1, check_sve_out + + rdsvl 11, 1 + adrp x6, sme_vl_out + str x11, [x6, :lo12:sme_vl_out] + + mrs x7, S3_3_C4_C2_2 + adrp x6, svcr_out + str x7, [x6, :lo12:svcr_out] + + // ZA? + tbz x7, #SVCR_ZA_SHIFT, check_sm_out + mov w12, #0 + ldr x6, =za_out +1: _str_za 12, 6 + add x6, x6, x11 + add x12, x12, #1 + cmp x11, x12 + bne 1b + + // ZT? + cbz x2, check_sm_out + adrp x6, zt_out + add x6, x6, :lo12:zt_out + _str_zt 6 + + // In streaming mode? +check_sm_out: + tbz x7, #SVCR_SM_SHIFT, check_sve_out + mov x4, x3 // FFR? + b read_sve + + // SVE? +check_sve_out: + cbz x0, wait_for_reads + mov x4, #1 + + rdvl x7, #1 + adrp x6, sve_vl_out + str x7, [x6, :lo12:sve_vl_out] + +read_sve: + ldr x7, =z_out + str z0, [x7, #0, MUL VL] + str z1, [x7, #1, MUL VL] + str z2, [x7, #2, MUL VL] + str z3, [x7, #3, MUL VL] + str z4, [x7, #4, MUL VL] + str z5, [x7, #5, MUL VL] + str z6, [x7, #6, MUL VL] + str z7, [x7, #7, MUL VL] + str z8, [x7, #8, MUL VL] + str z9, [x7, #9, MUL VL] + str z10, [x7, #10, MUL VL] + str z11, [x7, #11, MUL VL] + str z12, [x7, #12, MUL VL] + str z13, [x7, #13, MUL VL] + str z14, [x7, #14, MUL VL] + str z15, [x7, #15, MUL VL] + str z16, [x7, #16, MUL VL] + str z17, [x7, #17, MUL VL] + str z18, [x7, #18, MUL VL] + str z19, [x7, #19, MUL VL] + str z20, [x7, #20, MUL VL] + str z21, [x7, #21, MUL VL] + str z22, [x7, #22, MUL VL] + str z23, [x7, #23, MUL VL] + str z24, [x7, #24, MUL VL] + str z25, [x7, #25, MUL VL] + str z26, [x7, #26, MUL VL] + str z27, [x7, #27, MUL VL] + str z28, [x7, #28, MUL VL] + str z29, [x7, #29, MUL VL] + str z30, [x7, #30, MUL VL] + str z31, [x7, #31, MUL VL] + + ldr x7, =p_out + str p0, [x7, #0, MUL VL] + str p1, [x7, #1, MUL VL] + str p2, [x7, #2, MUL VL] + str p3, [x7, #3, MUL VL] + str p4, [x7, #4, MUL VL] + str p5, [x7, #5, MUL VL] + str p6, [x7, #6, MUL VL] + str p7, [x7, #7, MUL VL] + str p8, [x7, #8, MUL VL] + str p9, [x7, #9, MUL VL] + str p10, [x7, #10, MUL VL] + str p11, [x7, #11, MUL VL] + str p12, [x7, #12, MUL VL] + str p13, [x7, #13, MUL VL] + str p14, [x7, #14, MUL VL] + str p15, [x7, #15, MUL VL] + + // Only save FFR if it exists + cbz x4, wait_for_reads + ldr x7, =ffr_out + rdffr p0.b + str p0, [x7] + +wait_for_reads: + // Wait for the parent + brk #0 + + // Ensure we don't leave ourselves in streaming mode + cbz x1, out + msr S3_3_C4_C2_2, xzr + +out: + ldp x11, x12, [sp], #0x10 + ret diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c new file mode 100644 index 000000000000..c7ceafe5f471 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -0,0 +1,1503 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited.
+ * Original author: Mark Brown + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "../../kselftest.h" + +#include "fp-ptrace.h" + +/* and don't like each other, so: */ +#ifndef NT_ARM_SVE +#define NT_ARM_SVE 0x405 +#endif + +#ifndef NT_ARM_SSVE +#define NT_ARM_SSVE 0x40b +#endif + +#ifndef NT_ARM_ZA +#define NT_ARM_ZA 0x40c +#endif + +#ifndef NT_ARM_ZT +#define NT_ARM_ZT 0x40d +#endif + +#define ARCH_VQ_MAX 256 + +/* VL 128..2048 in powers of 2 */ +#define MAX_NUM_VLS 5 + +#define NUM_FPR 32 +__uint128_t v_in[NUM_FPR]; +__uint128_t v_expected[NUM_FPR]; +__uint128_t v_out[NUM_FPR]; + +char z_in[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)]; +char z_expected[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)]; +char z_out[__SVE_ZREGS_SIZE(ARCH_VQ_MAX)]; + +char p_in[__SVE_PREGS_SIZE(ARCH_VQ_MAX)]; +char p_expected[__SVE_PREGS_SIZE(ARCH_VQ_MAX)]; +char p_out[__SVE_PREGS_SIZE(ARCH_VQ_MAX)]; + +char ffr_in[__SVE_PREG_SIZE(ARCH_VQ_MAX)]; +char ffr_expected[__SVE_PREG_SIZE(ARCH_VQ_MAX)]; +char ffr_out[__SVE_PREG_SIZE(ARCH_VQ_MAX)]; + +char za_in[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)]; +char za_expected[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)]; +char za_out[ZA_SIG_REGS_SIZE(ARCH_VQ_MAX)]; + +char zt_in[ZT_SIG_REG_BYTES]; +char zt_expected[ZT_SIG_REG_BYTES]; +char zt_out[ZT_SIG_REG_BYTES]; + +uint64_t sve_vl_out; +uint64_t sme_vl_out; +uint64_t svcr_in, svcr_expected, svcr_out; + +void load_and_save(int sve, int sme, int sme2, int fa64); + +static bool got_alarm; + +static void handle_alarm(int sig, siginfo_t *info, void *context) +{ + got_alarm = true; +} + +#ifdef CONFIG_CPU_BIG_ENDIAN +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + u64 a = swab64(x); + u64 b = swab64(x >> 64); + + return ((__uint128_t)a << 64) | b; +} +#else +static __uint128_t arm64_cpu_to_le128(__uint128_t x) +{ + return x; +} +#endif + +#define arm64_le128_to_cpu(x) arm64_cpu_to_le128(x) + +static bool sve_supported(void) +{ + return getauxval(AT_HWCAP) & HWCAP_SVE; +} + +static bool sme_supported(void) +{ + return getauxval(AT_HWCAP2) & HWCAP2_SME; +} + +static bool sme2_supported(void) +{ + return getauxval(AT_HWCAP2) & HWCAP2_SME2; +} + +static bool fa64_supported(void) +{ + return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64; +} + +static bool compare_buffer(const char *name, void *out, + void *expected, size_t size) +{ + void *tmp; + + if (memcmp(out, expected, size) == 0) + return true; + + ksft_print_msg("Mismatch in %s\n", name); + + /* Did we just get zeros back? 
*/ + tmp = malloc(size); + if (!tmp) { + ksft_print_msg("OOM allocating %lu bytes for %s\n", + size, name); + ksft_exit_fail(); + } + memset(tmp, 0, size); + + if (memcmp(out, tmp, size) == 0) + ksft_print_msg("%s is zero\n", name); + + free(tmp); + + return false; +} + +struct test_config { + int sve_vl_in; + int sve_vl_expected; + int sme_vl_in; + int sme_vl_expected; + int svcr_in; + int svcr_expected; +}; + +struct test_definition { + const char *name; + bool sve_vl_change; + bool (*supported)(struct test_config *config); + void (*set_expected_values)(struct test_config *config); + void (*modify_values)(pid_t child, struct test_config *test_config); +}; + +static int vl_in(struct test_config *config) +{ + int vl; + + if (config->svcr_in & SVCR_SM) + vl = config->sme_vl_in; + else + vl = config->sve_vl_in; + + return vl; +} + +static int vl_expected(struct test_config *config) +{ + int vl; + + if (config->svcr_expected & SVCR_SM) + vl = config->sme_vl_expected; + else + vl = config->sve_vl_expected; + + return vl; +} + +static void run_child(struct test_config *config) +{ + int ret; + + /* Let the parent attach to us */ + ret = ptrace(PTRACE_TRACEME, 0, 0, 0); + if (ret < 0) + ksft_exit_fail_msg("PTRACE_TRACEME failed: %s (%d)\n", + strerror(errno), errno); + + /* VL setup */ + if (sve_supported()) { + ret = prctl(PR_SVE_SET_VL, config->sve_vl_in); + if (ret != config->sve_vl_in) { + ksft_print_msg("Failed to set SVE VL %d: %d\n", + config->sve_vl_in, ret); + } + } + + if (sme_supported()) { + ret = prctl(PR_SME_SET_VL, config->sme_vl_in); + if (ret != config->sme_vl_in) { + ksft_print_msg("Failed to set SME VL %d: %d\n", + config->sme_vl_in, ret); + } + } + + /* Load values and wait for the parent */ + load_and_save(sve_supported(), sme_supported(), + sme2_supported(), fa64_supported()); + + exit(0); +} + +static void read_one_child_regs(pid_t child, char *name, + struct iovec *iov_parent, + struct iovec *iov_child) +{ + int len = iov_parent->iov_len; + int ret; + + ret = process_vm_readv(child, iov_parent, 1, iov_child, 1, 0); + if (ret == -1) + ksft_print_msg("%s read failed: %s (%d)\n", + name, strerror(errno), errno); + else if (ret != len) + ksft_print_msg("Short read of %s: %d\n", name, ret); +} + +static void read_child_regs(pid_t child) +{ + struct iovec iov_parent, iov_child; + + /* + * Since the child fork()ed from us the buffer addresses are + * the same in parent and child. 
+ */ + iov_parent.iov_base = &v_out; + iov_parent.iov_len = sizeof(v_out); + iov_child.iov_base = &v_out; + iov_child.iov_len = sizeof(v_out); + read_one_child_regs(child, "FPSIMD", &iov_parent, &iov_child); + + if (sve_supported() || sme_supported()) { + iov_parent.iov_base = &sve_vl_out; + iov_parent.iov_len = sizeof(sve_vl_out); + iov_child.iov_base = &sve_vl_out; + iov_child.iov_len = sizeof(sve_vl_out); + read_one_child_regs(child, "SVE VL", &iov_parent, &iov_child); + + iov_parent.iov_base = &z_out; + iov_parent.iov_len = sizeof(z_out); + iov_child.iov_base = &z_out; + iov_child.iov_len = sizeof(z_out); + read_one_child_regs(child, "Z", &iov_parent, &iov_child); + + iov_parent.iov_base = &p_out; + iov_parent.iov_len = sizeof(p_out); + iov_child.iov_base = &p_out; + iov_child.iov_len = sizeof(p_out); + read_one_child_regs(child, "P", &iov_parent, &iov_child); + + iov_parent.iov_base = &ffr_out; + iov_parent.iov_len = sizeof(ffr_out); + iov_child.iov_base = &ffr_out; + iov_child.iov_len = sizeof(ffr_out); + read_one_child_regs(child, "FFR", &iov_parent, &iov_child); + } + + if (sme_supported()) { + iov_parent.iov_base = &sme_vl_out; + iov_parent.iov_len = sizeof(sme_vl_out); + iov_child.iov_base = &sme_vl_out; + iov_child.iov_len = sizeof(sme_vl_out); + read_one_child_regs(child, "SME VL", &iov_parent, &iov_child); + + iov_parent.iov_base = &svcr_out; + iov_parent.iov_len = sizeof(svcr_out); + iov_child.iov_base = &svcr_out; + iov_child.iov_len = sizeof(svcr_out); + read_one_child_regs(child, "SVCR", &iov_parent, &iov_child); + + iov_parent.iov_base = &za_out; + iov_parent.iov_len = sizeof(za_out); + iov_child.iov_base = &za_out; + iov_child.iov_len = sizeof(za_out); + read_one_child_regs(child, "ZA", &iov_parent, &iov_child); + } + + if (sme2_supported()) { + iov_parent.iov_base = &zt_out; + iov_parent.iov_len = sizeof(zt_out); + iov_child.iov_base = &zt_out; + iov_child.iov_len = sizeof(zt_out); + read_one_child_regs(child, "ZT", &iov_parent, &iov_child); + } +} + +static bool continue_breakpoint(pid_t child, + enum __ptrace_request restart_type) +{ + struct user_pt_regs pt_regs; + struct iovec iov; + int ret; + + /* Get PC */ + iov.iov_base = &pt_regs; + iov.iov_len = sizeof(pt_regs); + ret = ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov); + if (ret < 0) { + ksft_print_msg("Failed to get PC: %s (%d)\n", + strerror(errno), errno); + return false; + } + + /* Skip over the BRK */ + pt_regs.pc += 4; + ret = ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov); + if (ret < 0) { + ksft_print_msg("Failed to skip BRK: %s (%d)\n", + strerror(errno), errno); + return false; + } + + /* Restart */ + ret = ptrace(restart_type, child, 0, 0); + if (ret < 0) { + ksft_print_msg("Failed to restart child: %s (%d)\n", + strerror(errno), errno); + return false; + } + + return true; +} + +static bool check_ptrace_values_sve(pid_t child, struct test_config *config) +{ + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd; + struct iovec iov; + int ret, vq; + bool pass = true; + + if (!sve_supported()) + return true; + + vq = __sve_vq_from_vl(config->sve_vl_in); + + iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + ksft_print_msg("OOM allocating %lu byte SVE buffer\n", + iov.iov_len); + return false; + } + + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_SVE, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial SVE: %s (%d)\n", + strerror(errno), errno); + pass = false; + goto out; + } + + sve = 
iov.iov_base; + + if (sve->vl != config->sve_vl_in) { + ksft_print_msg("Mismatch in initial SVE VL: %d != %d\n", + sve->vl, config->sve_vl_in); + pass = false; + } + + /* If we are in streaming mode we should just read FPSIMD */ + if ((config->svcr_in & SVCR_SM) && (sve->flags & SVE_PT_REGS_SVE)) { + ksft_print_msg("NT_ARM_SVE reports SVE with PSTATE.SM\n"); + pass = false; + } + + if (sve->size != SVE_PT_SIZE(vq, sve->flags)) { + ksft_print_msg("Mismatch in SVE header size: %d != %lu\n", + sve->size, SVE_PT_SIZE(vq, sve->flags)); + pass = false; + } + + /* The registers might be in completely different formats! */ + if (sve->flags & SVE_PT_REGS_SVE) { + if (!compare_buffer("initial SVE Z", + iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0), + z_in, SVE_PT_SVE_ZREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("initial SVE P", + iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0), + p_in, SVE_PT_SVE_PREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("initial SVE FFR", + iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq), + ffr_in, SVE_PT_SVE_PREG_SIZE(vq))) + pass = false; + } else { + fpsimd = iov.iov_base + SVE_PT_FPSIMD_OFFSET; + if (!compare_buffer("initial V via SVE", &fpsimd->vregs[0], + v_in, sizeof(v_in))) + pass = false; + } + +out: + free(iov.iov_base); + return pass; +} + +static bool check_ptrace_values_ssve(pid_t child, struct test_config *config) +{ + struct user_sve_header *sve; + struct user_fpsimd_state *fpsimd; + struct iovec iov; + int ret, vq; + bool pass = true; + + if (!sme_supported()) + return true; + + vq = __sve_vq_from_vl(config->sme_vl_in); + + iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + ksft_print_msg("OOM allocating %lu byte SSVE buffer\n", + iov.iov_len); + return false; + } + + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_SSVE, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial SSVE: %s (%d)\n", + strerror(errno), errno); + pass = false; + goto out; + } + + sve = iov.iov_base; + + if (sve->vl != config->sme_vl_in) { + ksft_print_msg("Mismatch in initial SSVE VL: %d != %d\n", + sve->vl, config->sme_vl_in); + pass = false; + } + + if ((config->svcr_in & SVCR_SM) && !(sve->flags & SVE_PT_REGS_SVE)) { + ksft_print_msg("NT_ARM_SSVE reports FPSIMD with PSTATE.SM\n"); + pass = false; + } + + if (sve->size != SVE_PT_SIZE(vq, sve->flags)) { + ksft_print_msg("Mismatch in SSVE header size: %d != %lu\n", + sve->size, SVE_PT_SIZE(vq, sve->flags)); + pass = false; + } + + /* The registers might be in completely different formats! 
*/ + if (sve->flags & SVE_PT_REGS_SVE) { + if (!compare_buffer("initial SSVE Z", + iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0), + z_in, SVE_PT_SVE_ZREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("initial SSVE P", + iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0), + p_in, SVE_PT_SVE_PREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("initial SSVE FFR", + iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq), + ffr_in, SVE_PT_SVE_PREG_SIZE(vq))) + pass = false; + } else { + fpsimd = iov.iov_base + SVE_PT_FPSIMD_OFFSET; + if (!compare_buffer("initial V via SSVE", + &fpsimd->vregs[0], v_in, sizeof(v_in))) + pass = false; + } + +out: + free(iov.iov_base); + return pass; +} + +static bool check_ptrace_values_za(pid_t child, struct test_config *config) +{ + struct user_za_header *za; + struct iovec iov; + int ret, vq; + bool pass = true; + + if (!sme_supported()) + return true; + + vq = __sve_vq_from_vl(config->sme_vl_in); + + iov.iov_len = ZA_SIG_CONTEXT_SIZE(vq); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + ksft_print_msg("OOM allocating %lu byte ZA buffer\n", + iov.iov_len); + return false; + } + + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_ZA, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial ZA: %s (%d)\n", + strerror(errno), errno); + pass = false; + goto out; + } + + za = iov.iov_base; + + if (za->vl != config->sme_vl_in) { + ksft_print_msg("Mismatch in initial SME VL: %d != %d\n", + za->vl, config->sme_vl_in); + pass = false; + } + + /* If PSTATE.ZA is not set we should just read the header */ + if (config->svcr_in & SVCR_ZA) { + if (za->size != ZA_PT_SIZE(vq)) { + ksft_print_msg("Unexpected ZA ptrace read size: %d != %lu\n", + za->size, ZA_PT_SIZE(vq)); + pass = false; + } + + if (!compare_buffer("initial ZA", + iov.iov_base + ZA_PT_ZA_OFFSET, + za_in, ZA_PT_ZA_SIZE(vq))) + pass = false; + } else { + if (za->size != sizeof(*za)) { + ksft_print_msg("Unexpected ZA ptrace read size: %d != %lu\n", + za->size, sizeof(*za)); + pass = false; + } + } + +out: + free(iov.iov_base); + return pass; +} + +static bool check_ptrace_values_zt(pid_t child, struct test_config *config) +{ + uint8_t buf[512]; + struct iovec iov; + int ret; + + if (!sme2_supported()) + return true; + + iov.iov_base = &buf; + iov.iov_len = ZT_SIG_REG_BYTES; + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_ZT, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial ZT: %s (%d)\n", + strerror(errno), errno); + return false; + } + + return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES); +} + + +static bool check_ptrace_values(pid_t child, struct test_config *config) +{ + bool pass = true; + struct user_fpsimd_state fpsimd; + struct iovec iov; + int ret; + + iov.iov_base = &fpsimd; + iov.iov_len = sizeof(fpsimd); + ret = ptrace(PTRACE_GETREGSET, child, NT_PRFPREG, &iov); + if (ret == 0) { + if (!compare_buffer("initial V", &fpsimd.vregs, v_in, + sizeof(v_in))) { + pass = false; + } + } else { + ksft_print_msg("Failed to read initial V: %s (%d)\n", + strerror(errno), errno); + pass = false; + } + + if (!check_ptrace_values_sve(child, config)) + pass = false; + + if (!check_ptrace_values_ssve(child, config)) + pass = false; + + if (!check_ptrace_values_za(child, config)) + pass = false; + + if (!check_ptrace_values_zt(child, config)) + pass = false; + + return pass; +} + +static bool run_parent(pid_t child, struct test_definition *test, + struct test_config *config) +{ + int wait_status, ret; + pid_t pid; + bool pass; + + /* Initial attach */ + while (1) { + pid = 
waitpid(child, &wait_status, 0); + if (pid < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("waitpid() failed: %s (%d)\n", + strerror(errno), errno); + } + + if (pid == child) + break; + } + + if (WIFEXITED(wait_status)) { + ksft_print_msg("Child exited loading values with status %d\n", + WEXITSTATUS(wait_status)); + pass = false; + goto out; + } + + if (WIFSIGNALED(wait_status)) { + ksft_print_msg("Child died from signal %d loading values\n", + WTERMSIG(wait_status)); + pass = false; + goto out; + } + + /* Read initial values via ptrace */ + pass = check_ptrace_values(child, config); + + /* Do whatever writes we want to do */ + if (test->modify_values) + test->modify_values(child, config); + + if (!continue_breakpoint(child, PTRACE_CONT)) + goto cleanup; + + while (1) { + pid = waitpid(child, &wait_status, 0); + if (pid < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("waitpid() failed: %s (%d)\n", + strerror(errno), errno); + } + + if (pid == child) + break; + } + + if (WIFEXITED(wait_status)) { + ksft_print_msg("Child exited saving values with status %d\n", + WEXITSTATUS(wait_status)); + pass = false; + goto out; + } + + if (WIFSIGNALED(wait_status)) { + ksft_print_msg("Child died from signal %d saving values\n", + WTERMSIG(wait_status)); + pass = false; + goto out; + } + + /* See what happened as a result */ + read_child_regs(child); + + if (!continue_breakpoint(child, PTRACE_DETACH)) + goto cleanup; + + /* The child should exit cleanly */ + got_alarm = false; + alarm(1); + while (1) { + if (got_alarm) { + ksft_print_msg("Wait for child timed out\n"); + goto cleanup; + } + + pid = waitpid(child, &wait_status, 0); + if (pid < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("waitpid() failed: %s (%d)\n", + strerror(errno), errno); + } + + if (pid == child) + break; + } + alarm(0); + + if (got_alarm) { + ksft_print_msg("Timed out waiting for child\n"); + pass = false; + goto cleanup; + } + + if (pid == child && WIFSIGNALED(wait_status)) { + ksft_print_msg("Child died from signal %d cleaning up\n", + WTERMSIG(wait_status)); + pass = false; + goto out; + } + + if (pid == child && WIFEXITED(wait_status)) { + if (WEXITSTATUS(wait_status) != 0) { + ksft_print_msg("Child exited with error %d\n", + WEXITSTATUS(wait_status)); + pass = false; + } + } else { + ksft_print_msg("Child did not exit cleanly\n"); + pass = false; + goto cleanup; + } + + goto out; + +cleanup: + ret = kill(child, SIGKILL); + if (ret != 0) { + ksft_print_msg("kill() failed: %s (%d)\n", + strerror(errno), errno); + return false; + } + + while (1) { + pid = waitpid(child, &wait_status, 0); + if (pid < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("waitpid() failed: %s (%d)\n", + strerror(errno), errno); + } + + if (pid == child) + break; + } + +out: + return pass; +} + +static void fill_random(void *buf, size_t size) +{ + int i; + uint32_t *lbuf = buf; + + /* random() returns a 32 bit number regardless of the size of long */ + for (i = 0; i < size / sizeof(uint32_t); i++) + lbuf[i] = random(); +} + +static void fill_random_ffr(void *buf, size_t vq) +{ + uint8_t *lbuf = buf; + int bits, i; + + /* + * Only values with a continuous set of 0..n bits set are + * valid for FFR, set all bits then clear a random number of + * high bits. 
+ */ + memset(buf, 0, __SVE_FFR_SIZE(vq)); + + bits = random() % (__SVE_FFR_SIZE(vq) * 8); + for (i = 0; i < bits / 8; i++) + lbuf[i] = 0xff; + if (bits / 8 != __SVE_FFR_SIZE(vq)) + lbuf[i] = (1 << (bits % 8)) - 1; +} + +static void fpsimd_to_sve(__uint128_t *v, char *z, int vl) +{ + int vq = __sve_vq_from_vl(vl); + int i; + __uint128_t *p; + + if (!vl) + return; + + for (i = 0; i < __SVE_NUM_ZREGS; i++) { + p = (__uint128_t *)&z[__SVE_ZREG_OFFSET(vq, i)]; + *p = arm64_cpu_to_le128(v[i]); + } +} + +static void set_initial_values(struct test_config *config) +{ + int vq = __sve_vq_from_vl(vl_in(config)); + int sme_vq = __sve_vq_from_vl(config->sme_vl_in); + + svcr_in = config->svcr_in; + svcr_expected = config->svcr_expected; + svcr_out = 0; + + fill_random(&v_in, sizeof(v_in)); + memcpy(v_expected, v_in, sizeof(v_in)); + memset(v_out, 0, sizeof(v_out)); + + /* Changes will be handled in the test case */ + if (sve_supported() || (config->svcr_in & SVCR_SM)) { + /* The low 128 bits of Z are shared with the V registers */ + fill_random(&z_in, __SVE_ZREGS_SIZE(vq)); + fpsimd_to_sve(v_in, z_in, vl_in(config)); + memcpy(z_expected, z_in, __SVE_ZREGS_SIZE(vq)); + memset(z_out, 0, sizeof(z_out)); + + fill_random(&p_in, __SVE_PREGS_SIZE(vq)); + memcpy(p_expected, p_in, __SVE_PREGS_SIZE(vq)); + memset(p_out, 0, sizeof(p_out)); + + if ((config->svcr_in & SVCR_SM) && !fa64_supported()) + memset(ffr_in, 0, __SVE_PREG_SIZE(vq)); + else + fill_random_ffr(&ffr_in, vq); + memcpy(ffr_expected, ffr_in, __SVE_PREG_SIZE(vq)); + memset(ffr_out, 0, __SVE_PREG_SIZE(vq)); + } + + if (config->svcr_in & SVCR_ZA) + fill_random(za_in, ZA_SIG_REGS_SIZE(sme_vq)); + else + memset(za_in, 0, ZA_SIG_REGS_SIZE(sme_vq)); + if (config->svcr_expected & SVCR_ZA) + memcpy(za_expected, za_in, ZA_SIG_REGS_SIZE(sme_vq)); + else + memset(za_expected, 0, ZA_SIG_REGS_SIZE(sme_vq)); + if (sme_supported()) + memset(za_out, 0, sizeof(za_out)); + + if (sme2_supported()) { + if (config->svcr_in & SVCR_ZA) + fill_random(zt_in, ZT_SIG_REG_BYTES); + else + memset(zt_in, 0, ZT_SIG_REG_BYTES); + if (config->svcr_expected & SVCR_ZA) + memcpy(zt_expected, zt_in, ZT_SIG_REG_BYTES); + else + memset(zt_expected, 0, ZT_SIG_REG_BYTES); + memset(zt_out, 0, sizeof(zt_out)); + } +} + +static bool check_memory_values(struct test_config *config) +{ + bool pass = true; + int vq, sme_vq; + + if (!compare_buffer("saved V", v_out, v_expected, sizeof(v_out))) + pass = false; + + vq = __sve_vq_from_vl(vl_expected(config)); + sme_vq = __sve_vq_from_vl(config->sme_vl_expected); + + if (svcr_out != svcr_expected) { + ksft_print_msg("Mismatch in saved SVCR %lx != %lx\n", + svcr_out, svcr_expected); + pass = false; + } + + if (sve_vl_out != config->sve_vl_expected) { + ksft_print_msg("Mismatch in SVE VL: %ld != %d\n", + sve_vl_out, config->sve_vl_expected); + pass = false; + } + + if (sme_vl_out != config->sme_vl_expected) { + ksft_print_msg("Mismatch in SME VL: %ld != %d\n", + sme_vl_out, config->sme_vl_expected); + pass = false; + } + + if (!compare_buffer("saved Z", z_out, z_expected, + __SVE_ZREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("saved P", p_out, p_expected, + __SVE_PREGS_SIZE(vq))) + pass = false; + + if (!compare_buffer("saved FFR", ffr_out, ffr_expected, + __SVE_PREG_SIZE(vq))) + pass = false; + + if (!compare_buffer("saved ZA", za_out, za_expected, + ZA_PT_ZA_SIZE(sme_vq))) + pass = false; + + if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES)) + pass = false; + + return pass; +} + +static bool sve_sme_same(struct 
test_config *config) +{ + if (config->sve_vl_in != config->sve_vl_expected) + return false; + + if (config->sme_vl_in != config->sme_vl_expected) + return false; + + if (config->svcr_in != config->svcr_expected) + return false; + + return true; +} + +static bool sve_write_supported(struct test_config *config) +{ + if (!sve_supported() && !sme_supported()) + return false; + + if ((config->svcr_in & SVCR_ZA) != (config->svcr_expected & SVCR_ZA)) + return false; + + if (config->svcr_expected & SVCR_SM) { + if (config->sve_vl_in != config->sve_vl_expected) { + return false; + } + + /* Changing the SME VL disables ZA */ + if ((config->svcr_expected & SVCR_ZA) && + (config->sme_vl_in != config->sme_vl_expected)) { + return false; + } + } else { + if (config->sme_vl_in != config->sme_vl_expected) { + return false; + } + } + + return true; +} + +static void fpsimd_write_expected(struct test_config *config) +{ + int vl; + + fill_random(&v_expected, sizeof(v_expected)); + + /* The SVE registers are flushed by a FPSIMD write */ + vl = vl_expected(config); + + memset(z_expected, 0, __SVE_ZREGS_SIZE(__sve_vq_from_vl(vl))); + memset(p_expected, 0, __SVE_PREGS_SIZE(__sve_vq_from_vl(vl))); + memset(ffr_expected, 0, __SVE_PREG_SIZE(__sve_vq_from_vl(vl))); + + fpsimd_to_sve(v_expected, z_expected, vl); +} + +static void fpsimd_write(pid_t child, struct test_config *test_config) +{ + struct user_fpsimd_state fpsimd; + struct iovec iov; + int ret; + + memset(&fpsimd, 0, sizeof(fpsimd)); + memcpy(&fpsimd.vregs, v_expected, sizeof(v_expected)); + + iov.iov_base = &fpsimd; + iov.iov_len = sizeof(fpsimd); + ret = ptrace(PTRACE_SETREGSET, child, NT_PRFPREG, &iov); + if (ret == -1) + ksft_print_msg("FPSIMD set failed: (%s) %d\n", + strerror(errno), errno); +} + +static void sve_write_expected(struct test_config *config) +{ + int vl = vl_expected(config); + int sme_vq = __sve_vq_from_vl(config->sme_vl_expected); + + fill_random(z_expected, __SVE_ZREGS_SIZE(__sve_vq_from_vl(vl))); + fill_random(p_expected, __SVE_PREGS_SIZE(__sve_vq_from_vl(vl))); + + if ((svcr_expected & SVCR_SM) && !fa64_supported()) + memset(ffr_expected, 0, __SVE_PREG_SIZE(sme_vq)); + else + fill_random_ffr(ffr_expected, __sve_vq_from_vl(vl)); + + /* Share the low bits of Z with V */ + fill_random(&v_expected, sizeof(v_expected)); + fpsimd_to_sve(v_expected, z_expected, vl); + + if (config->sme_vl_in != config->sme_vl_expected) { + memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq)); + memset(zt_expected, 0, sizeof(zt_expected)); + } +} + +static void sve_write(pid_t child, struct test_config *config) +{ + struct user_sve_header *sve; + struct iovec iov; + int ret, vl, vq, regset; + + vl = vl_expected(config); + vq = __sve_vq_from_vl(vl); + + iov.iov_len = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + ksft_print_msg("Failed allocating %lu byte SVE write buffer\n", + iov.iov_len); + return; + } + memset(iov.iov_base, 0, iov.iov_len); + + sve = iov.iov_base; + sve->size = iov.iov_len; + sve->flags = SVE_PT_REGS_SVE; + sve->vl = vl; + + memcpy(iov.iov_base + SVE_PT_SVE_ZREG_OFFSET(vq, 0), + z_expected, SVE_PT_SVE_ZREGS_SIZE(vq)); + memcpy(iov.iov_base + SVE_PT_SVE_PREG_OFFSET(vq, 0), + p_expected, SVE_PT_SVE_PREGS_SIZE(vq)); + memcpy(iov.iov_base + SVE_PT_SVE_FFR_OFFSET(vq), + ffr_expected, SVE_PT_SVE_PREG_SIZE(vq)); + + if (svcr_expected & SVCR_SM) + regset = NT_ARM_SSVE; + else + regset = NT_ARM_SVE; + + ret = ptrace(PTRACE_SETREGSET, child, regset, &iov); + if (ret != 0) + 
ksft_print_msg("Failed to write SVE: %s (%d)\n", + strerror(errno), errno); + + free(iov.iov_base); +} + +static bool za_write_supported(struct test_config *config) +{ + if (config->svcr_expected & SVCR_SM) { + if (!(config->svcr_in & SVCR_SM)) + return false; + + /* Changing the SME VL exits streaming mode */ + if (config->sme_vl_in != config->sme_vl_expected) { + return false; + } + } + + /* Can't disable SM outside a VL change */ + if ((config->svcr_in & SVCR_SM) && + !(config->svcr_expected & SVCR_SM)) + return false; + + return true; +} + +static void za_write_expected(struct test_config *config) +{ + int sme_vq, sve_vq; + + sme_vq = __sve_vq_from_vl(config->sme_vl_expected); + + if (config->svcr_expected & SVCR_ZA) { + fill_random(za_expected, ZA_PT_ZA_SIZE(sme_vq)); + } else { + memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq)); + memset(zt_expected, 0, sizeof(zt_expected)); + } + + /* Changing the SME VL flushes ZT, SVE state and exits SM */ + if (config->sme_vl_in != config->sme_vl_expected) { + svcr_expected &= ~SVCR_SM; + + sve_vq = __sve_vq_from_vl(vl_expected(config)); + memset(z_expected, 0, __SVE_ZREGS_SIZE(sve_vq)); + memset(p_expected, 0, __SVE_PREGS_SIZE(sve_vq)); + memset(ffr_expected, 0, __SVE_PREG_SIZE(sve_vq)); + memset(zt_expected, 0, sizeof(zt_expected)); + + fpsimd_to_sve(v_expected, z_expected, vl_expected(config)); + } +} + +static void za_write(pid_t child, struct test_config *config) +{ + struct user_za_header *za; + struct iovec iov; + int ret, vq; + + vq = __sve_vq_from_vl(config->sme_vl_expected); + + if (config->svcr_expected & SVCR_ZA) + iov.iov_len = ZA_PT_SIZE(vq); + else + iov.iov_len = sizeof(*za); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + ksft_print_msg("Failed allocating %lu byte ZA write buffer\n", + iov.iov_len); + return; + } + memset(iov.iov_base, 0, iov.iov_len); + + za = iov.iov_base; + za->size = iov.iov_len; + za->vl = config->sme_vl_expected; + if (config->svcr_expected & SVCR_ZA) + memcpy(iov.iov_base + ZA_PT_ZA_OFFSET, za_expected, + ZA_PT_ZA_SIZE(vq)); + + ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_ZA, &iov); + if (ret != 0) + ksft_print_msg("Failed to write ZA: %s (%d)\n", + strerror(errno), errno); + + free(iov.iov_base); +} + +static bool zt_write_supported(struct test_config *config) +{ + if (!sme2_supported()) + return false; + if (config->sme_vl_in != config->sme_vl_expected) + return false; + if (!(config->svcr_expected & SVCR_ZA)) + return false; + if ((config->svcr_in & SVCR_SM) != (config->svcr_expected & SVCR_SM)) + return false; + + return true; +} + +static void zt_write_expected(struct test_config *config) +{ + int sme_vq; + + sme_vq = __sve_vq_from_vl(config->sme_vl_expected); + + if (config->svcr_expected & SVCR_ZA) { + fill_random(zt_expected, sizeof(zt_expected)); + } else { + memset(za_expected, 0, ZA_PT_ZA_SIZE(sme_vq)); + memset(zt_expected, 0, sizeof(zt_expected)); + } +} + +static void zt_write(pid_t child, struct test_config *config) +{ + struct iovec iov; + int ret; + + iov.iov_len = ZT_SIG_REG_BYTES; + iov.iov_base = zt_expected; + ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_ZT, &iov); + if (ret != 0) + ksft_print_msg("Failed to write ZT: %s (%d)\n", + strerror(errno), errno); +} + +/* Actually run a test */ +static void run_test(struct test_definition *test, struct test_config *config) +{ + pid_t child; + char name[1024]; + bool pass; + + if (sve_supported() && sme_supported()) + snprintf(name, sizeof(name), "%s, SVE %d->%d, SME %d/%x->%d/%x", + test->name, + config->sve_vl_in, 
config->sve_vl_expected, + config->sme_vl_in, config->svcr_in, + config->sme_vl_expected, config->svcr_expected); + else if (sve_supported()) + snprintf(name, sizeof(name), "%s, SVE %d->%d", test->name, + config->sve_vl_in, config->sve_vl_expected); + else if (sme_supported()) + snprintf(name, sizeof(name), "%s, SME %d/%x->%d/%x", + test->name, + config->sme_vl_in, config->svcr_in, + config->sme_vl_expected, config->svcr_expected); + else + snprintf(name, sizeof(name), "%s", test->name); + + if (test->supported && !test->supported(config)) { + ksft_test_result_skip("%s\n", name); + return; + } + + set_initial_values(config); + + if (test->set_expected_values) + test->set_expected_values(config); + + child = fork(); + if (child < 0) + ksft_exit_fail_msg("fork() failed: %s (%d)\n", + strerror(errno), errno); + /* run_child() never returns */ + if (child == 0) + run_child(config); + + pass = run_parent(child, test, config); + if (!check_memory_values(config)) + pass = false; + + ksft_test_result(pass, "%s\n", name); +} + +static void run_tests(struct test_definition defs[], int count, + struct test_config *config) +{ + int i; + + for (i = 0; i < count; i++) + run_test(&defs[i], config); +} + +static struct test_definition base_test_defs[] = { + { + .name = "No writes", + .supported = sve_sme_same, + }, + { + .name = "FPSIMD write", + .supported = sve_sme_same, + .set_expected_values = fpsimd_write_expected, + .modify_values = fpsimd_write, + }, +}; + +static struct test_definition sve_test_defs[] = { + { + .name = "SVE write", + .supported = sve_write_supported, + .set_expected_values = sve_write_expected, + .modify_values = sve_write, + }, +}; + +static struct test_definition za_test_defs[] = { + { + .name = "ZA write", + .supported = za_write_supported, + .set_expected_values = za_write_expected, + .modify_values = za_write, + }, +}; + +static struct test_definition zt_test_defs[] = { + { + .name = "ZT write", + .supported = zt_write_supported, + .set_expected_values = zt_write_expected, + .modify_values = zt_write, + }, +}; + +static int sve_vls[MAX_NUM_VLS], sme_vls[MAX_NUM_VLS]; +static int sve_vl_count, sme_vl_count; + +static void probe_vls(const char *name, int vls[], int *vl_count, int set_vl) +{ + unsigned int vq; + int vl; + + *vl_count = 0; + + for (vq = ARCH_VQ_MAX; vq > 0; vq /= 2) { + vl = prctl(set_vl, vq * 16); + if (vl == -1) + ksft_exit_fail_msg("SET_VL failed: %s (%d)\n", + strerror(errno), errno); + + vl &= PR_SVE_VL_LEN_MASK; + + if (*vl_count && (vl == vls[*vl_count - 1])) + break; + + vq = sve_vq_from_vl(vl); + + vls[*vl_count] = vl; + *vl_count += 1; + } + + if (*vl_count > 2) { + /* Just use the minimum and maximum */ + vls[1] = vls[*vl_count - 1]; + ksft_print_msg("%d %s VLs, using %d and %d\n", + *vl_count, name, vls[0], vls[1]); + *vl_count = 2; + } else { + ksft_print_msg("%d %s VLs\n", *vl_count, name); + } +} + +static struct { + int svcr_in, svcr_expected; +} svcr_combinations[] = { + { .svcr_in = 0, .svcr_expected = 0, }, + { .svcr_in = 0, .svcr_expected = SVCR_SM, }, + { .svcr_in = 0, .svcr_expected = SVCR_ZA, }, + /* Can't enable both SM and ZA with a single ptrace write */ + + { .svcr_in = SVCR_SM, .svcr_expected = 0, }, + { .svcr_in = SVCR_SM, .svcr_expected = SVCR_SM, }, + { .svcr_in = SVCR_SM, .svcr_expected = SVCR_ZA, }, + { .svcr_in = SVCR_SM, .svcr_expected = SVCR_SM | SVCR_ZA, }, + + { .svcr_in = SVCR_ZA, .svcr_expected = 0, }, + { .svcr_in = SVCR_ZA, .svcr_expected = SVCR_SM, }, + { .svcr_in = SVCR_ZA, .svcr_expected = SVCR_ZA, }, + { .svcr_in = 
SVCR_ZA, .svcr_expected = SVCR_SM | SVCR_ZA, }, + + { .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = 0, }, + { .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_SM, }, + { .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_ZA, }, + { .svcr_in = SVCR_SM | SVCR_ZA, .svcr_expected = SVCR_SM | SVCR_ZA, }, +}; + +static void run_sve_tests(void) +{ + struct test_config test_config; + int i, j; + + if (!sve_supported()) + return; + + test_config.sme_vl_in = sme_vls[0]; + test_config.sme_vl_expected = sme_vls[0]; + test_config.svcr_in = 0; + test_config.svcr_expected = 0; + + for (i = 0; i < sve_vl_count; i++) { + test_config.sve_vl_in = sve_vls[i]; + + for (j = 0; j < sve_vl_count; j++) { + test_config.sve_vl_expected = sve_vls[j]; + + run_tests(base_test_defs, + ARRAY_SIZE(base_test_defs), + &test_config); + if (sve_supported()) + run_tests(sve_test_defs, + ARRAY_SIZE(sve_test_defs), + &test_config); + } + } + +} + +static void run_sme_tests(void) +{ + struct test_config test_config; + int i, j, k; + + if (!sme_supported()) + return; + + test_config.sve_vl_in = sve_vls[0]; + test_config.sve_vl_expected = sve_vls[0]; + + /* + * Every SME VL/SVCR combination + */ + for (i = 0; i < sme_vl_count; i++) { + test_config.sme_vl_in = sme_vls[i]; + + for (j = 0; j < sme_vl_count; j++) { + test_config.sme_vl_expected = sme_vls[j]; + + for (k = 0; k < ARRAY_SIZE(svcr_combinations); k++) { + test_config.svcr_in = svcr_combinations[k].svcr_in; + test_config.svcr_expected = svcr_combinations[k].svcr_expected; + + run_tests(base_test_defs, + ARRAY_SIZE(base_test_defs), + &test_config); + run_tests(sve_test_defs, + ARRAY_SIZE(sve_test_defs), + &test_config); + run_tests(za_test_defs, + ARRAY_SIZE(za_test_defs), + &test_config); + + if (sme2_supported()) + run_tests(zt_test_defs, + ARRAY_SIZE(zt_test_defs), + &test_config); + } + } + } +} + +int main(void) +{ + struct test_config test_config; + struct sigaction sa; + int tests, ret, tmp; + + srandom(getpid()); + + ksft_print_header(); + + if (sve_supported()) { + probe_vls("SVE", sve_vls, &sve_vl_count, PR_SVE_SET_VL); + + tests = ARRAY_SIZE(base_test_defs) + + ARRAY_SIZE(sve_test_defs); + tests *= sve_vl_count * sve_vl_count; + } else { + /* Only run the FPSIMD tests */ + sve_vl_count = 1; + tests = ARRAY_SIZE(base_test_defs); + } + + if (sme_supported()) { + probe_vls("SME", sme_vls, &sme_vl_count, PR_SME_SET_VL); + + tmp = ARRAY_SIZE(base_test_defs) + ARRAY_SIZE(sve_test_defs) + + ARRAY_SIZE(za_test_defs); + + if (sme2_supported()) + tmp += ARRAY_SIZE(zt_test_defs); + + tmp *= sme_vl_count * sme_vl_count; + tmp *= ARRAY_SIZE(svcr_combinations); + tests += tmp; + } else { + sme_vl_count = 1; + } + + if (sme2_supported()) + ksft_print_msg("SME2 supported\n"); + + if (fa64_supported()) + ksft_print_msg("FA64 supported\n"); + + ksft_set_plan(tests); + + /* Get signal handers ready before we start any children */ + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handle_alarm; + sa.sa_flags = SA_RESTART | SA_SIGINFO; + sigemptyset(&sa.sa_mask); + ret = sigaction(SIGALRM, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGALRM handler: %s (%d)\n", + strerror(errno), errno); + + /* + * Run the test set if there is no SVE or SME, with those we + * have to pick a VL for each run. 
+ */ + if (!sve_supported()) { + test_config.sve_vl_in = 0; + test_config.sve_vl_expected = 0; + test_config.sme_vl_in = 0; + test_config.sme_vl_expected = 0; + test_config.svcr_in = 0; + test_config.svcr_expected = 0; + + run_tests(base_test_defs, ARRAY_SIZE(base_test_defs), + &test_config); + } + + run_sve_tests(); + run_sme_tests(); + + ksft_finished(); +} diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h new file mode 100644 index 000000000000..db4f2c4d750c --- /dev/null +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2021-3 ARM Limited. + +#ifndef FP_PTRACE_H +#define FP_PTRACE_H + +#define SVCR_SM_SHIFT 0 +#define SVCR_ZA_SHIFT 1 + +#define SVCR_SM (1 << SVCR_SM_SHIFT) +#define SVCR_ZA (1 << SVCR_ZA_SHIFT) + +#endif From b4725d3e460349aa55836a2a128009c578215b90 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Jan 2024 18:12:35 +0000 Subject: [PATCH 086/134] arm64/sve: Remove bitrotted comment about syscall behaviour When we documented that we always clear state not shared with FPSIMD we didn't catch all of the places that mentioned that state might not be cleared, remove a lingering reference. Reported-by: Edmund Grimley-Evans Reviewed-by: Dave Martin Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240124-arm64-sve-sme-doc-v2-1-fe3964fb3c19@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sve.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index 0d9a426e9f85..b45a2da19bf1 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -117,11 +117,6 @@ the SVE instruction set architecture. * The SVE registers are not used to pass arguments to or receive results from any syscall. -* In practice the affected registers/bits will be preserved or will be replaced - with zeros on return from a syscall, but userspace should not make - assumptions about this. The kernel behaviour may vary on a case-by-case - basis. - * All other SVE state of a thread, including the currently configured vector length, the state of the PR_SVE_VL_INHERIT flag, and the deferred vector length (if any), is preserved across all syscalls, subject to the specific From ae35792764bcba1dc13d2de62480eae8b97c2c41 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Jan 2024 18:12:36 +0000 Subject: [PATCH 087/134] arm64/sme: Fix cut'n'paste in ABI document The ABI for SME is very like that for SVE so bits of the ABI were copied but not adequately search and replaced, fix that. Reported-by: Edmund Grimley-Evans Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20240124-arm64-sve-sme-doc-v2-2-fe3964fb3c19@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index 3d0e53ecac4f..3133d0e91b48 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -238,12 +238,12 @@ prctl(PR_SME_SET_VL, unsigned long arg) bits of Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become unspecified, including both streaming and non-streaming SVE state. 
Calling PR_SME_SET_VL with vl equal to the thread's current vector - length, or calling PR_SME_SET_VL with the PR_SVE_SET_VL_ONEXEC flag, + length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag, does not constitute a change to the vector length for this purpose. * Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared. Calling PR_SME_SET_VL with vl equal to the thread's current vector - length, or calling PR_SME_SET_VL with the PR_SVE_SET_VL_ONEXEC flag, + length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag, does not constitute a change to the vector length for this purpose. From 3fd97cf3234c5ffbd420319d1d510e4940183843 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Jan 2024 18:12:37 +0000 Subject: [PATCH 088/134] arm64/fp: Clarify effect of setting an unsupported system VL The documentation for system vector length configuration does not cover all cases where unsupported values are written, tighten it up. Reported-by: Edmund Grimley-Evans Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20240124-arm64-sve-sme-doc-v2-3-fe3964fb3c19@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 5 ++--- Documentation/arch/arm64/sve.rst | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index 3133d0e91b48..f4376c06f447 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -379,9 +379,8 @@ The regset data starts with struct user_za_header, containing: /proc/sys/abi/sme_default_vector_length Writing the text representation of an integer to this file sets the system - default vector length to the specified value, unless the value is greater - than the maximum vector length supported by the system in which case the - default vector length is set to that maximum. + default vector length to the specified value rounded to a supported value + using the same rules as for setting vector length via PR_SME_SET_VL. The result can be determined by reopening the file and reading its contents. diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index b45a2da19bf1..8d8837fc39ec 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -423,9 +423,8 @@ The regset data starts with struct user_sve_header, containing: /proc/sys/abi/sve_default_vector_length Writing the text representation of an integer to this file sets the system - default vector length to the specified value, unless the value is greater - than the maximum vector length supported by the system in which case the - default vector length is set to that maximum. + default vector length to the specified value rounded to a supported value + using the same rules as for setting vector length via PR_SVE_SET_VL. The result can be determined by reopening the file and reading its contents. From e47c18c3b25e63807005d0cb40f7743cb7512388 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 24 Jan 2024 18:12:38 +0000 Subject: [PATCH 089/134] arm64/sme: Remove spurious 'is' in SME documentation Just a typographical error. 
Reported-by: Edmund Grimley-Evans Reviewed-by: Dave Martin Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240124-arm64-sve-sme-doc-v2-4-fe3964fb3c19@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index f4376c06f447..be317d457417 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -75,7 +75,7 @@ model features for SME is included in Appendix A. 2. Vector lengths ------------------ -SME defines a second vector length similar to the SVE vector length which is +SME defines a second vector length similar to the SVE vector length which controls the size of the streaming mode SVE vectors and the ZA matrix array. The ZA matrix is square with each side having as many bytes as a streaming mode SVE vector. From fdd867fe9b32c30b290aa10097f89daff09625cc Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 20 Feb 2024 08:02:03 +0530 Subject: [PATCH 090/134] arm64/sysreg: Add register fields for ID_AA64DFR1_EL1 This adds register fields for ID_AA64DFR1_EL1 as per the definitions based on DDI0601 2023-12. Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240220023203.3091229-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 4c9b67934367..dd693f992832 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1251,7 +1251,36 @@ EndEnum EndSysreg Sysreg ID_AA64DFR1_EL1 3 0 0 5 1 -Res0 63:0 +Field 63:56 ABL_CMPs +UnsignedEnum 55:52 DPFZS + 0b0000 IGNR + 0b0001 FRZN +EndEnum +UnsignedEnum 51:48 EBEP + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 47:44 ITE + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 43:40 ABLE + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 39:36 PMICNTR + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 35:32 SPMU + 0b0000 NI + 0b0001 IMP + 0b0010 IMP_SPMZR +EndEnum +Field 31:24 CTX_CMPs +Field 23:16 WRPs +Field 15:8 BRPs +Field 7:0 SYSPMUID EndSysreg Sysreg ID_AA64AFR0_EL1 3 0 0 5 4 From 7accfaad89d76ca8750da916a242a4ba6a4f83db Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 20 Feb 2024 08:23:43 +0530 Subject: [PATCH 091/134] arm64/sysreg: Update ID_DFR0_EL1 register fields This updates ID_DFR0_EL1.PerfMon and ID_DFR0_EL1.CopDbg register fields as per the definitions based on DDI0601 2023-12. 
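For illustration, a field added here is normally consumed via the cpufeature helpers rather than read back by hand; a minimal sketch of how PerfMon could be tested (the accessors are the existing kernel ones from <asm/sysreg.h> and <asm/cpufeature.h>, the function itself is hypothetical, and real code should prefer the sanitised view from read_sanitised_ftr_reg()):

	static bool __maybe_unused cpu_has_pmuv3p9(void)
	{
		u64 dfr0 = read_sysreg_s(SYS_ID_DFR0_EL1);

		/* PerfMon is ID_DFR0_EL1[27:24]; 0x9 is PMUv3p9 in the table above */
		return cpuid_feature_extract_unsigned_field(dfr0, 24) == 0x9;
	}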
Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240220025343.3093955-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index dd693f992832..a9cab2b730a3 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -200,6 +200,7 @@ UnsignedEnum 27:24 PerfMon 0b0110 PMUv3p5 0b0111 PMUv3p7 0b1000 PMUv3p8 + 0b1001 PMUv3p9 0b1111 IMPDEF EndEnum Enum 23:20 MProfDbg @@ -231,6 +232,7 @@ Enum 3:0 CopDbg 0b1000 Debugv8p2 0b1001 Debugv8p4 0b1010 Debugv8p8 + 0b1011 Debugv8p9 EndEnum EndSysreg

From 358fee2917058eb06907f966cc69c11ea6c63e1f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 20 Feb 2024 09:18:29 +0530 Subject: [PATCH 092/134] arm64/sysreg: Update ID_AA64DFR0_EL1 register

This updates ID_AA64DFR0_EL1.PMSVer and ID_AA64DFR0_EL1.DebugVer register fields as per the definitions based on DDI0601 2023-12.

Cc: Will Deacon Cc: Mark Brown Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240220034829.3098373-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index a9cab2b730a3..508224a0e078 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1223,6 +1223,7 @@ UnsignedEnum 35:32 PMSVer 0b0010 V1P1 0b0011 V1P2 0b0100 V1P3 + 0b0101 V1P4 EndEnum Field 31:28 CTX_CMPs Res0 27:24 @@ -1249,6 +1250,7 @@ UnsignedEnum 3:0 DebugVer 0b1000 V8P2 0b1001 V8P4 0b1010 V8P8 + 0b1011 V8P9 EndEnum EndSysreg

From bce79b0c8097ae8b9ad38d7cb8522b80a9b8ee00 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Fri, 2 Feb 2024 12:02:11 +0800 Subject: [PATCH 093/134] arm64: remove unneeded BUILD_BUG_ON assertion

Since commit c02433dd6de3 ("arm64: split thread_info from task stack"), CONFIG_THREAD_INFO_IN_TASK is enabled unconditionally for arm64. So remove this always-true assertion from arch_dup_task_struct.

Signed-off-by: Dawei Li Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240202040211.3118918-1-dawei.li@shingroup.cn Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 3 --- 1 file changed, 3 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 7387b68c745b..4ae31b7af6c3 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -290,9 +290,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) fpsimd_preserve_current_state(); *dst = *src; - /* We rely on the above assignment to initialize dst's thread_flags: */ - BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK)); - /* * Detach src's sve_state (if any) from dst so that it does not * get erroneously used or freed prematurely. dst's copies

From 58a0484eaf5ec1e94e530c1074abf852354eca8c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 31 Jan 2024 01:55:04 +0800 Subject: [PATCH 094/134] arm64: make member of struct pt_regs and its offset macro in the same order

In struct pt_regs, member pstate is after member pc. Move the offset macro of pstate after the offset macro of pc to improve readability a little.
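The invariant being mirrored can be stated as a purely illustrative compile-time sketch (not part of the patch; assumes <linux/stddef.h> for offsetof() and <asm/ptrace.h> for struct pt_regs):

	_Static_assert(offsetof(struct pt_regs, pc) <
		       offsetof(struct pt_regs, pstate),
		       "S_PC should be defined before S_PSTATE");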
Signed-off-by: Kemeng Shi Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240130175504.106364-1-shikemeng@huaweicloud.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5a7dbbe0ce63..81496083c041 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,8 +75,8 @@ int main(void) DEFINE(S_FP, offsetof(struct pt_regs, regs[29])); DEFINE(S_LR, offsetof(struct pt_regs, regs[30])); DEFINE(S_SP, offsetof(struct pt_regs, sp)); - DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); + DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); From 21eb468e9fc11692952c42f86a44d07f94803d4f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 Feb 2024 16:53:36 +0000 Subject: [PATCH 095/134] arm64/sve: Document that __SVE_VQ_MAX is much larger than needed __SVE_VQ_MAX is defined without comment as 512 but the actual architectural maximum is 16, a substantial difference which might not be obvious to readers especially given the several different units used for specifying vector sizes in various contexts and the fact that it's often used via macros. In an effort to minimise surprises for users who might assume the value is the architectural maximum and use it to do things like size allocations add a comment noting the difference, and add a note for SVE_VQ_MAX to aid discoverability. Signed-off-by: Mark Brown Acked-by: Dave Martin Link: https://lore.kernel.org/r/20240209-arm64-sve-vl-max-comment-v2-1-111b283469ee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sve_context.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sve_context.h b/arch/arm64/include/uapi/asm/sve_context.h index 754ab751b523..72aefc081061 100644 --- a/arch/arm64/include/uapi/asm/sve_context.h +++ b/arch/arm64/include/uapi/asm/sve_context.h @@ -13,6 +13,17 @@ #define __SVE_VQ_BYTES 16 /* number of bytes per quadword */ +/* + * Yes, __SVE_VQ_MAX is 512 QUADWORDS. + * + * To help ensure forward portability, this is much larger than the + * current maximum value defined by the SVE architecture. While arrays + * or static allocations can be sized based on this value, watch out! + * It will waste a surprisingly large amount of memory. + * + * Dynamic sizing based on the actual runtime vector length is likely to + * be preferable for most purposes. + */ #define __SVE_VQ_MIN 1 #define __SVE_VQ_MAX 512 From 2f0090549b649cc9fd61c0193189c25f8fc03119 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 13 Feb 2024 15:32:45 +0000 Subject: [PATCH 096/134] arm64/sve: Ensure that all fields in ZCR_EL1 are set to known values At present nothing in our CPU initialisation code ever sets unknown fields in ZCR_EL1 to known values, all updates to ZCR_EL1 are read/modify/write sequences for LEN. All the unknown fields are RES0, explicitly initialise them as such to avoid future surprises. 
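The resulting pattern is roughly the following (a sketch of the idea rather than the exact fpsimd.c hunk; SYS_ZCR_EL1 and ZCR_ELx_LEN_MASK are the existing definitions, and vq is a hypothetical local holding the desired number of 128-bit quadwords):

	/* Write a fully-known value first: all RES0 fields become zero... */
	write_sysreg_s(0, SYS_ZCR_EL1);
	/* ...then program only the LEN field */
	write_sysreg_s((vq - 1) & ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);
	isb();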
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240213-arm64-fp-init-vec-cr-v1-1-7e7c2d584f26@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a5dc6f764195..cc3c9ad877a8 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1134,6 +1134,8 @@ void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) { write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1); isb(); + + write_sysreg_s(0, SYS_ZCR_EL1); } void __init sve_setup(void) From 93576e34988757ed431e305ba9d3597e46dffe6b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 13 Feb 2024 15:32:46 +0000 Subject: [PATCH 097/134] arm64/sme: Ensure that all fields in SMCR_EL1 are set to known values At present nothing in our CPU initialisation code ever sets unknown fields in SMCR_EL1 to known values, all updates to SMCR_EL1 are read/modify/write sequences. All the unknown fields are RES0, explicitly initialise them as such to avoid future surprises. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240213-arm64-fp-init-vec-cr-v1-2-7e7c2d584f26@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index cc3c9ad877a8..f96907b813fa 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1247,6 +1247,9 @@ void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p) write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1); isb(); + /* Ensure all bits in SMCR are set to known values */ + write_sysreg_s(0, SYS_SMCR_EL1); + /* Allow EL0 to access TPIDR2 */ write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1); isb(); From 2758269149250e2c28991fb8d216ffa40857ee0a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 22 Feb 2024 22:04:41 +0100 Subject: [PATCH 098/134] arm64: gitignore: ignore relacheck Add the generated executable for relacheck to the list of ignored files. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240222210441.33142-1-brgl@bgdev.pl Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 arch/arm64/kernel/pi/.gitignore diff --git a/arch/arm64/kernel/pi/.gitignore b/arch/arm64/kernel/pi/.gitignore new file mode 100644 index 000000000000..efb29b663e85 --- /dev/null +++ b/arch/arm64/kernel/pi/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +relacheck From a743f26d03a96593c0f3d05dc26b388f45de67c9 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 22 Feb 2024 22:40:29 -0800 Subject: [PATCH 099/134] arm64: ftrace: Don't forbid CALL_OPS+CC_OPTIMIZE_FOR_SIZE with Clang Per commit b3f11af9b2ce ("arm64: ftrace: forbid CALL_OPS with CC_OPTIMIZE_FOR_SIZE"), GCC is silently ignoring `-falign-functions=N` when passed `-Os`, causing functions to be improperly aligned. This doesn't seem to be a problem with Clang though, where enabling CALL_OPS with CC_OPTIMIZE_FOR_SIZE doesn't spit out any warnings at boot about misaligned patch-sites. Only forbid CALL_OPS if GCC is used and we're optimizing for size so that CALL_OPS can be used with clang optimizing for size. 
Cc: Jason Ling Cc: Florian Fainelli Cc: Mark Rutland Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Cc: llvm@lists.linux.dev Fixes: b3f11af9b2ce ("arm64: ftrace: forbid CALL_OPS with CC_OPTIMIZE_FOR_SIZE") Signed-off-by: Stephen Boyd Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240223064032.3463229-1-swboyd@chromium.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 430fabf20f17..b73d702e1e30 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -198,7 +198,7 @@ config ARM64 if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \ if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \ - !CC_OPTIMIZE_FOR_SIZE) + (CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE)) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS select HAVE_SAMPLE_FTRACE_DIRECT

From 1984c805461f7fc4e96855eb4d94043ffb8f873d Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 9 Jan 2024 00:46:50 -0300 Subject: [PATCH 100/134] arm64: remove unnecessary ifdefs around is_compat_task()

Currently some parts of the codebase will test for CONFIG_COMPAT before testing is_compat_task(). is_compat_task() is an inlined function only present on CONFIG_COMPAT. On the other hand, for !CONFIG_COMPAT, we have in linux/compat.h:

#define is_compat_task() (0)

Since we have this define available in every usage of is_compat_task() for !CONFIG_COMPAT, it's unnecessary to keep the ifdefs, since the compiler is smart enough to optimize-out those snippets on CONFIG_COMPAT=n.

This requires some regset code as well as a few other defines to be made available on !CONFIG_COMPAT, so some symbols can get resolved before getting optimized-out.

Signed-off-by: Leonardo Bras Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240109034651.478462-2-leobras@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/elf.h | 10 +++++----- arch/arm64/include/asm/fpsimd.h | 2 -- arch/arm64/kernel/ptrace.c | 8 +++----- arch/arm64/kernel/syscall.c | 5 +---- 4 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 97932fbf973d..3f93f4eef953 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -201,16 +201,16 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #define COMPAT_ELF_PLATFORM ("v8l") #endif -#ifdef CONFIG_COMPAT - -/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */ -#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL - /* AArch32 registers. */ #define COMPAT_ELF_NGREG 18 typedef unsigned int compat_elf_greg_t; typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; +#ifdef CONFIG_COMPAT + +/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */ +#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL + /* AArch32 EABI. */ #define EF_ARM_EABI_MASK 0xff000000 int compat_elf_check_arch(const struct elf32_hdr *); diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50e5f25d3024..94c7ed82fbad 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -21,7 +21,6 @@ #include #include -#ifdef CONFIG_COMPAT /* Masks for extracting the FPSR and FPCR from the FPSCR */ #define VFP_FPSCR_STAT_MASK 0xf800009f #define VFP_FPSCR_CTRL_MASK 0x07f79f00 @@ -30,7 +29,6 @@ * control/status register.
*/ #define VFP_STATE_SIZE ((32 * 8) + 4) -#endif static inline unsigned long cpacr_save_enable_kernel_sve(void) { diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index dc6cf0e37194..366fa248b968 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -174,7 +174,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; -#ifdef CONFIG_COMPAT if (is_compat_task()) { int si_errno = 0; int i; @@ -196,7 +195,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, desc); return; } -#endif + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } @@ -1596,7 +1595,6 @@ static const struct user_regset_view user_aarch64_view = { .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; -#ifdef CONFIG_COMPAT enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, @@ -1853,6 +1851,7 @@ static const struct user_regset_view user_aarch32_ptrace_view = { .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; +#ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { @@ -2114,7 +2113,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. Native ptrace requests on @@ -2125,7 +2123,7 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; -#endif + return &user_aarch64_view; } diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 9a70d9746b66..ad198262b981 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -20,14 +20,11 @@ long sys_ni_syscall(void); static long do_ni_syscall(struct pt_regs *regs, int scno) { -#ifdef CONFIG_COMPAT - long ret; if (is_compat_task()) { - ret = compat_arm_syscall(regs, scno); + long ret = compat_arm_syscall(regs, scno); if (ret != -ENOSYS) return ret; } -#endif return sys_ni_syscall(); } From 6d1ce806e17fcabe91a912363cc1a5f108734627 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Wed, 28 Feb 2024 11:28:36 +0900 Subject: [PATCH 101/134] arm64: Update setup_arch() comment on interrupt masking DAIF_PROCCTX_NOIRQ contains the FIQ bit. Update the comment as only asynchronous aborts are unmasked and FIQ is still masked. Signed-off-by: Ryo Takakura Link: https://lore.kernel.org/r/20240228022836.1756-1-takakura@valinux.co.jp Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/kernel/setup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 42c690bb2d60..ab43bfa85368 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -320,9 +320,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) dynamic_scs_init(); /* - * Unmask asynchronous aborts and fiq after bringing up possible - * earlycon. (Report possible System Errors once we can report this - * occurred). + * Unmask SError as soon as possible after initializing earlycon so + * that we can report any SErrors immediately. 
*/ local_daif_restore(DAIF_PROCCTX_NOIRQ);

From 3137db4c66bf70360ee7027af5c50662b3152046 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Mar 2024 11:40:48 +0100 Subject: [PATCH 102/134] arm64/mm: Use generic __pud_free() helper in pud_free() implementation

Commit 0dd4f60a2c76 ("arm64: mm: Add support for folding PUDs at runtime") implements specialized PUD alloc/free helpers to allow the decision whether or not to fold PUDs to be made at runtime when the number of paging levels is 4 or higher. Its implementation of pud_free() is based on the generic version that existed when the patch was first written, but in the meantime, the freeing of a PUD has become a bit more involved, and so instead of simply freeing the page, we should invoke the generic __pud_free() that encapsulates whatever needs doing at this point.

This fixes a reported warning emitted by the page flags self-diagnostics.

Reported-by: Ryan Roberts Signed-off-by: Ard Biesheuvel Tested-by: Ryan Roberts Link: https://lore.kernel.org/r/20240301104046.1234309-5-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgalloc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index aeba2cf15a25..8ff5f2a2579e 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -60,8 +60,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) { if (!pgtable_l4_enabled()) return; - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); + __pud_free(mm, pud); } #else static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)

From 27f2b9fcddc76d542ac339febf2af55b67f610ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Mar 2024 11:40:49 +0100 Subject: [PATCH 103/134] arm64/mm: Avoid ID mapping of kpti flag if it is no longer needed

arm64_use_ng_mappings will be set to 'true' by the early boot code if it decides to use non-global (nG) attributes for all kernel mappings, typically when enabling KASLR on a system that does not implement E0PD. In this case, the G-to-nG update routines are never called, and so there is no reason to create the writable mapping of the associated status flag in the ID map.

Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240301104046.1234309-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b131ed31a6c8..bf5b1c426ad0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -777,7 +777,7 @@ static void __init create_idmap(void) IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, __phys_to_virt(ptep) - ptep); - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) { extern u32 __idmap_kpti_flag; u64 pa = __pa_symbol(&__idmap_kpti_flag);

From 622442666dcca0f273fd8b1adf80cd1893ed88cf Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 29 Feb 2024 10:52:08 +0000 Subject: [PATCH 104/134] arm64: cpufeatures: Clean up temporary variable to simplify code

Clean up one temporary variable to simplify code in capability detection.
Signed-off-by: Liao Chang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240229105208.456704-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d1a634a403e..0e900b23f7ab 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3052,13 +3052,9 @@ static void __init enable_cpu_capabilities(u16 scope_mask) boot_scope = !!(scope_mask & SCOPE_BOOT_CPU); for (i = 0; i < ARM64_NCAPS; i++) { - unsigned int num; - caps = cpucap_ptrs[i]; - if (!caps || !(caps->type & scope_mask)) - continue; - num = caps->capability; - if (!cpus_have_cap(num)) + if (!caps || !(caps->type & scope_mask) || + !cpus_have_cap(caps->capability)) continue; if (boot_scope && caps->cpu_enable)

From 9d6b6789c8787fb1183d176a00569fb9b192243d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 29 Feb 2024 14:04:31 +0530 Subject: [PATCH 105/134] arm64/hw_breakpoint: Directly use ESR_ELx_WNR for a watchpoint exception

Let's use the existing ISS encoding for a watchpoint exception, i.e. ESR_ELx_WNR, which indicates whether the instruction was writing to or reading from a memory location when the watchpoint exception was taken. While at it, drop the non-standard macro AARCH64_ESR_ACCESS_MASK.

Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Mark Brown Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240229083431.356578-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hw_breakpoint.h | 1 - arch/arm64/kernel/hw_breakpoint.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 84055329cd8b..bd81cf17744a 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -59,7 +59,6 @@ static inline void decode_ctrl_reg(u32 reg, /* Watchpoints */ #define ARM_BREAKPOINT_LOAD 1 #define ARM_BREAKPOINT_STORE 2 -#define AARCH64_ESR_ACCESS_MASK (1 << 6) /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 35225632d70a..2f5755192c2b 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -779,7 +780,7 @@ static int watchpoint_handler(unsigned long addr, unsigned long esr, * Check that the access type matches. * 0 => load, otherwise => store */ - access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : + access = (esr & ESR_ELx_WNR) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) continue;

From 527db67a4d85a3400465e6d3685170a24d1deeba Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 29 Feb 2024 21:28:01 +0800 Subject: [PATCH 106/134] arm64: Remove enable_daif macro

Since commit bb8e93a287a5 ("arm64: entry: convert SError handlers to C"), the enable_daif assembler macro is no longer used anywhere, so remove it.
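For reference, the removed macro expanded to a single system register write; an equivalent C-level sketch (the in-tree interface for DAIF manipulation is the local_daif_*() family in <asm/daifflags.h>, the function name here is hypothetical):

	static inline void enable_daif_sketch(void)
	{
		/* unmask Debug, SError, IRQ and FIQ, as "msr daifclr, #0xf" did */
		asm volatile("msr daifclr, #0xf" ::: "memory");
	}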
Signed-off-by: Jinjie Ruan Reviewed-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240229132802.1682026-2-ruanjinjie@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 4 ---- 1 file changed, 4 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 513787e43329..96b18a707507 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -38,10 +38,6 @@ msr daifset, #0xf .endm - .macro enable_daif - msr daifclr, #0xf - .endm - /* * Save/restore interrupts. */

From e10b6976f6b9afdf3564f88c851e42d139bb19c0 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Tue, 27 Feb 2024 20:52:31 +0800 Subject: [PATCH 107/134] drivers/perf: hisi: Enable HiSilicon Erratum 162700402 quirk for HIP09

HiSilicon UC PMU v2 suffers from erratum 162700402, whereby the PMU counter cannot be set due to the lack of a clock under power saving mode. This leads to wrong or inaccurate counts. The clock can be enabled by the PMU global enabling control.

This patch tries to fix this by setting the UC PMU enable before setting the event period in order to turn on the clock, and then restoring the UC PMU configuration. The counter register can hold its value without a clock.

Signed-off-by: Junhao He Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20240227125231.53127-1-hejunhao3@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_uc_pmu.c | 42 ++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c index 636fb79647c8..481dcc9e8fbf 100644 --- a/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c @@ -287,12 +287,52 @@ static u64 hisi_uc_pmu_read_counter(struct hisi_pmu *uc_pmu, return readq(uc_pmu->base + HISI_UC_CNTR_REGn(hwc->idx)); } -static void hisi_uc_pmu_write_counter(struct hisi_pmu *uc_pmu, +static bool hisi_uc_pmu_get_glb_en_state(struct hisi_pmu *uc_pmu) +{ + u32 val; + + val = readl(uc_pmu->base + HISI_UC_EVENT_CTRL_REG); + return !!FIELD_GET(HISI_UC_EVENT_GLB_EN, val); +} + +static void hisi_uc_pmu_write_counter_normal(struct hisi_pmu *uc_pmu, struct hw_perf_event *hwc, u64 val) { writeq(val, uc_pmu->base + HISI_UC_CNTR_REGn(hwc->idx)); } +static void hisi_uc_pmu_write_counter_quirk_v2(struct hisi_pmu *uc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + hisi_uc_pmu_start_counters(uc_pmu); + hisi_uc_pmu_write_counter_normal(uc_pmu, hwc, val); + hisi_uc_pmu_stop_counters(uc_pmu); +} + +static void hisi_uc_pmu_write_counter(struct hisi_pmu *uc_pmu, + struct hw_perf_event *hwc, u64 val) +{ + bool enable = hisi_uc_pmu_get_glb_en_state(uc_pmu); + bool erratum = uc_pmu->identifier == HISI_PMU_V2; + + /* + * HiSilicon UC PMU v2 suffers from erratum 162700402: the + * PMU counter cannot be set due to the lack of a clock under + * power saving mode, which leads to wrong or inaccurate counts. + * The clock can be enabled by the PMU global enabling control. + * The irq handler and pmu_start() will call this function to set + * the period. If the function is called in irq context, the PMU + * has been enabled, therefore we set the counter directly. + * Otherwise the PMU is disabled; we need to enable it to turn on + * the counter clock to set the period, and then restore the PMU + * enable status. The counter can hold its value without a clock.
+ */ + if (enable || !erratum) + hisi_uc_pmu_write_counter_normal(uc_pmu, hwc, val); + else + hisi_uc_pmu_write_counter_quirk_v2(uc_pmu, hwc, val); +} + static void hisi_uc_pmu_enable_counter_int(struct hisi_pmu *uc_pmu, struct hw_perf_event *hwc) {

From 54a9e47eebb9064de9c65a6c22bb31e1a67f3903 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 23 Feb 2024 18:33:52 +0800 Subject: [PATCH 108/134] drivers/perf: hisi_pcie: Rename hisi_pcie_pmu_{config,clear}_filter()

hisi_pcie_pmu_{config,clear}_filter() configure/clear the HISI_PCIE_EVENT_CTRL register, which contains not only the filter but also the event code. The function names are a bit misleading. Rename them to hisi_pcie_pmu_{config,clear}_event_ctrl() to reflect their functions more accurately.

Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-2-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index b90ba8aca3fa..9760ddde46fd 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -216,7 +216,7 @@ static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, writeq_relaxed(val, pcie_pmu->base + offset); } -static void hisi_pcie_pmu_config_filter(struct perf_event *event) +static void hisi_pcie_pmu_config_event_ctrl(struct perf_event *event) { struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; @@ -259,7 +259,7 @@ static void hisi_pcie_pmu_config_filter(struct perf_event *event) hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg); } -static void hisi_pcie_pmu_clear_filter(struct perf_event *event) +static void hisi_pcie_pmu_clear_event_ctrl(struct perf_event *event) { struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; @@ -505,7 +505,7 @@ static void hisi_pcie_pmu_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; - hisi_pcie_pmu_config_filter(event); + hisi_pcie_pmu_config_event_ctrl(event); hisi_pcie_pmu_enable_counter(pcie_pmu, hwc); hisi_pcie_pmu_enable_int(pcie_pmu, hwc); hisi_pcie_pmu_set_period(event); @@ -526,7 +526,7 @@ static void hisi_pcie_pmu_stop(struct perf_event *event, int flags) hisi_pcie_pmu_event_update(event); hisi_pcie_pmu_disable_int(pcie_pmu, hwc); hisi_pcie_pmu_disable_counter(pcie_pmu, hwc); - hisi_pcie_pmu_clear_filter(event); + hisi_pcie_pmu_clear_event_ctrl(event); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED;

From 4d473461e0948645efa82b4c025d014f40c373ff Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 23 Feb 2024 18:33:53 +0800 Subject: [PATCH 109/134] drivers/perf: hisi_pcie: Introduce hisi_pcie_pmu_get_event_ctrl_val()

Factor out retrieval of the register value for the corresponding event from hisi_pcie_config_event_ctrl() into a new function hisi_pcie_pmu_get_event_ctrl_val(), allowing future reuse.
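To make the intended reuse concrete: a caller can now compare complete control register values, so two events collide only if the event code and every filter field match. A sketch of such a consumer (hypothetical helper name, mirroring how the follow-up fix below uses it):

	static bool hisi_pcie_pmu_same_config(struct perf_event *a,
					      struct perf_event *b)
	{
		return hisi_pcie_pmu_get_event_ctrl_val(a) ==
		       hisi_pcie_pmu_get_event_ctrl_val(b);
	}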
Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-3-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 9760ddde46fd..2468cf3b007c 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -216,10 +216,8 @@ static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, writeq_relaxed(val, pcie_pmu->base + offset); } -static void hisi_pcie_pmu_config_event_ctrl(struct perf_event *event) +static u64 hisi_pcie_pmu_get_event_ctrl_val(struct perf_event *event) { - struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; u64 port, trig_len, thr_len, len_mode; u64 reg = HISI_PCIE_INIT_SET; @@ -256,6 +254,15 @@ static void hisi_pcie_pmu_config_event_ctrl(struct perf_event *event) else reg |= FIELD_PREP(HISI_PCIE_LEN_M, HISI_PCIE_LEN_M_DEFAULT); + return reg; +} + +static void hisi_pcie_pmu_config_event_ctrl(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 reg = hisi_pcie_pmu_get_event_ctrl_val(event); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg); }

From b6693ad68e2725a61d628f077e75eb3c31b9ea44 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 23 Feb 2024 18:33:54 +0800 Subject: [PATCH 110/134] drivers/perf: hisi_pcie: Fix incorrect counting under metric mode

The metric counting shows incorrect results if the events in the metric group use the same event but different filter options. This is because we only judge by the event code when deciding whether events in the metric group should share the same hardware counter, ignoring their filter settings.

For example, on a platform with two ports 0x1 and 0x2, where only port 0x1 has a downstream PCIe NVME device, the metric counting shows both ports with the same counts because we misassign these two events to the same hardware counter:

[root@localhost perf-iostat]# ./perf stat -e '{hisi_pcie0_core1/event=0x0104,port=0x2/,hisi_pcie0_core1/event=0x0104,port=0x1/}'

 Performance counter stats for 'system wide':

     7907484924      hisi_pcie0_core1/event=0x0104,port=0x2/
     7907484924      hisi_pcie0_core1/event=0x0104,port=0x1/

       10.153863691 seconds time elapsed

Fix this by using the whole config rather than the event code only to judge whether two events are the same and should share the same hardware counter.
With this patch, the metric counting in the above case tends to be corrected: [root@localhost perf-iostat]# ./perf stat -e '{hisi_pcie0_core1/event=0x0104,port=0x2/,hisi_pcie0_core1/event=0x0104,port=0x1/}' Performance counter stats for 'system wide': 0 hisi_pcie0_core1/event=0x0104,port=0x2/ 8123122077 hisi_pcie0_core1/event=0x0104,port=0x1/ 10.152875631 seconds time elapsed Fixes: 8404b0fbc7fb ("drivers/perf: hisi: Add driver for HiSilicon PCIe PMU") Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-4-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 2468cf3b007c..9176242eadb3 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -314,10 +314,16 @@ static bool hisi_pcie_pmu_valid_filter(struct perf_event *event, return true; } +/* + * Check Whether two events share the same config. The same config means not + * only the event code, but also the filter settings of the two events are + * the same. + */ static bool hisi_pcie_pmu_cmp_event(struct perf_event *target, struct perf_event *event) { - return hisi_pcie_get_real_event(target) == hisi_pcie_get_real_event(event); + return hisi_pcie_pmu_get_event_ctrl_val(target) == + hisi_pcie_pmu_get_event_ctrl_val(event); } static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event) From 00ca69b856ba5ff0dab241bafe7119cd08348a92 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 23 Feb 2024 18:33:55 +0800 Subject: [PATCH 111/134] drivers/perf: hisi_pcie: Add more events for counting TLP bandwidth A typical PCIe transaction is consisted of various TLP packets in both direction. For counting bandwidth only memory read events are exported currently. Add memory write and completion counting events of both direction to complete the bandwidth counting. 
Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-5-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 9176242eadb3..6f39cb82661e 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -727,10 +727,18 @@ static struct attribute *hisi_pcie_pmu_events_attr[] = { HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210), HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011), HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011), + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_flux, 0x0104), + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_time, 0x10104), HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x0804), HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x10804), + HISI_PCIE_PMU_EVENT_ATTR(rx_cpl_flux, 0x2004), + HISI_PCIE_PMU_EVENT_ATTR(rx_cpl_time, 0x12004), + HISI_PCIE_PMU_EVENT_ATTR(tx_mwr_flux, 0x0105), + HISI_PCIE_PMU_EVENT_ATTR(tx_mwr_time, 0x10105), HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x0405), HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x10405), + HISI_PCIE_PMU_EVENT_ATTR(tx_cpl_flux, 0x1005), + HISI_PCIE_PMU_EVENT_ATTR(tx_cpl_time, 0x11005), NULL }; From 2f864fee085190f6a9c114f94affa0bdc2970f16 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Fri, 23 Feb 2024 18:33:56 +0800 Subject: [PATCH 112/134] drivers/perf: hisi_pcie: Check the target filter properly The PMU can monitor the traffic of a certain target Root Port or a downstream target Endpoint. Users can specify the target filter with the "port" or "bdf" option respectively. The PMU can only monitor the Root Ports or Endpoints on the same PCIe core, so the value of "port" or "bdf" must be valid and will be checked by the driver. Currently at least one of the "port" and "bdf" options must be set, and only one of them takes effect at a time. If the "port" filter is not set or is set explicitly to zero (the default), the driver assumes the user has specified a "bdf" option, since the "port" option is a bitmask of the target Root Ports and zero is not a valid value. However, if the user sets neither the "port" nor the "bdf" filter explicitly, the driver programs the target filter with the default "bdf" value (zero) but skips the validity check for bdf=0, even though that is a valid value (meaning 0000:00:00.0). The user then just gets zero counts. Therefore, check whether both "port" and "bdf" are invalid; if so, return failure and report a warning.
Testing: before the patch: 0 hisi_pcie0_core1/rx_mrd_flux/ 0 hisi_pcie0_core1/rx_mrd_flux,port=0/ 24,124 hisi_pcie0_core1/rx_mrd_flux,port=1/ 0 hisi_pcie0_core1/rx_mrd_flux,bdf=0/ 0 hisi_pcie0_core1/rx_mrd_flux,port=0x800/ hisi_pcie0_core1/rx_mrd_flux,bdf=1/ 24,132 hisi_pcie0_core1/rx_mrd_flux,bdf=0x1700/ hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x0/ hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x1/ 24,138 hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x1700/ 24,126 hisi_pcie0_core1/rx_mrd_flux,port=0x1,bdf=0x0/ after the patch: hisi_pcie0_core1/rx_mrd_flux/ hisi_pcie0_core1/rx_mrd_flux,port=0/ 24,153 hisi_pcie0_core1/rx_mrd_flux,port=1/ 0 hisi_pcie0_core1/rx_mrd_flux,port=0x800/ hisi_pcie0_core1/rx_mrd_flux,bdf=0/ hisi_pcie0_core1/rx_mrd_flux,bdf=1/ 24,117 hisi_pcie0_core1/rx_mrd_flux,bdf=0x1700/ hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x0/ hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x1/ 24,120 hisi_pcie0_core1/rx_mrd_flux,port=0x0,bdf=0x1700/ 24,123 hisi_pcie0_core1/rx_mrd_flux,port=0x1,bdf=0x0/ Signed-off-by: Junhao He Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-6-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 6f39cb82661e..b2dde7559639 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -306,10 +306,10 @@ static bool hisi_pcie_pmu_valid_filter(struct perf_event *event, if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL) return false; - if (requester_id) { - if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id)) - return false; - } + /* Need to explicitly set filter of "port" or "bdf" */ + if (!hisi_pcie_get_port(event) && + !hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id)) + return false; return true; } From 2fbf96ed883adcdf0f641cfe07e695dac7e5d540 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Fri, 23 Feb 2024 18:33:57 +0800 Subject: [PATCH 113/134] drivers/perf: hisi_pcie: Relax the check on related events If we use two events with the same filter and related event types (see the following example), the driver checks whether they are related events and whether they are in the same group; if they are related but not grouped, hisi_pcie_pmu_find_related_event() returns -EINVAL, so the 2nd event cannot count while the 1st event is running, even though the PCIe PMU has other idle counters. In this case the perf event scheduler makes the two events multiplex one counter, and if the user then uses the formula (1st event_value / 2nd event_value) to calculate the bandwidth, the result is wrong because the two events do not count over the same period. Fix this by letting related events use different idle counters when they are not in the same event group. Note that related events are best used in the same group [1]. There are two ways to know if two events are related: a) by event name, such as the latency events "xxx_latency, xxx_cnt" or bandwidth events "xxx_flux, xxx_time"; b) by event type, such as "event=0xXXXX, event=0x1XXXX".
Use group to count the related events: [1] -e "{pmu_name/xxx_latency,port=1/,pmu_name/xxx_cnt,port=1/}" example: 1st event: hisi_pcie0_core1/event=0x804,port=1 2nd event: hisi_pcie0_core1/event=0x10804,port=1 test cmd: perf stat -e hisi_pcie0_core1/event=0x804,port=1/ \ -e hisi_pcie0_core1/event=0x10804,port=1/ before patch: 25,281 hisi_pcie0_core1/event=0x804,port=1/ (49.91%) 470,598 hisi_pcie0_core1/event=0x10804,port=1/ (50.09%) after patch: 24,147 hisi_pcie0_core1/event=0x804,port=1/ 474,558 hisi_pcie0_core1/event=0x10804,port=1/ Signed-off-by: Junhao He Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-7-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index b2dde7559639..5b15f3698188 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -409,14 +409,10 @@ static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu, if (!sibling) continue; - if (!hisi_pcie_pmu_cmp_event(sibling, event)) - continue; - /* Related events must be used in group */ - if (sibling->group_leader == event->group_leader) + if (hisi_pcie_pmu_cmp_event(sibling, event) && + sibling->group_leader == event->group_leader) return idx; - else - return -EINVAL; } return idx; From 7da377059ee653dd4ddcc126fd26c9c78f7bc4e7 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Fri, 23 Feb 2024 18:33:58 +0800 Subject: [PATCH 114/134] drivers/perf: hisi_pcie: Merge find_related_event() and get_event_idx() The function xxx_find_related_event() scans all working events to find related events. During this process we can also find the idle counters. If no related event is found, return the first idle counter to simplify the code. Signed-off-by: Junhao He Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-8-yangyicong@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 51 ++++++++++---------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 5b15f3698188..5d1f0e9fdb08 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -398,16 +398,24 @@ static u64 hisi_pcie_pmu_read_counter(struct perf_event *event) return hisi_pcie_pmu_readq(pcie_pmu, event->hw.event_base, idx); } -static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu, - struct perf_event *event) +/* + * Check all working events: if a related event is found then return its + * counter; otherwise return the first idle counter (which needs a reset).
+ */ +static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu, + struct perf_event *event) { + int first_idle = -EAGAIN; struct perf_event *sibling; int idx; for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { sibling = pcie_pmu->hw_events[idx]; - if (!sibling) + if (!sibling) { + if (first_idle == -EAGAIN) + first_idle = idx; continue; + } /* Related events must be used in group */ if (hisi_pcie_pmu_cmp_event(sibling, event) && @@ -415,19 +423,7 @@ static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu, sibling->group_leader == event->group_leader) return idx; } - return idx; -} - -static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu) -{ - int idx; - - for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { - if (!pcie_pmu->hw_events[idx]) - return idx; - } - - return -EINVAL; + return first_idle; } static void hisi_pcie_pmu_event_update(struct perf_event *event) @@ -553,27 +549,18 @@ static int hisi_pcie_pmu_add(struct perf_event *event, int flags) hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; - /* Check all working events to find a related event. */ - idx = hisi_pcie_pmu_find_related_event(pcie_pmu, event); - if (idx < 0) - return idx; - - /* Current event shares an enabled counter with the related event */ - if (idx < HISI_PCIE_MAX_COUNTERS) { - hwc->idx = idx; - goto start_count; - } - - idx = hisi_pcie_pmu_get_event_idx(pcie_pmu); + idx = hisi_pcie_pmu_get_event_idx(pcie_pmu, event); if (idx < 0) return idx; hwc->idx = idx; - pcie_pmu->hw_events[idx] = event; - /* Reset Counter to avoid previous statistic interference. */ - hisi_pcie_pmu_reset_counter(pcie_pmu, idx); -start_count: + /* No enabled counter found with related event, reset it */ + if (!pcie_pmu->hw_events[idx]) { + hisi_pcie_pmu_reset_counter(pcie_pmu, idx); + pcie_pmu->hw_events[idx] = event; + } + if (flags & PERF_EF_START) hisi_pcie_pmu_start(event, PERF_EF_RELOAD); From 89a032923d4ba23907405744aa86265822f057f8 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Fri, 23 Feb 2024 18:33:59 +0800 Subject: [PATCH 115/134] docs: perf: Update usage for target filter of hisi-pcie-pmu One of the "port" and "bdf" target filter interfaces must be set, and the related events should preferably be used in the same group. Update the usage in the documentation. Signed-off-by: Junhao He Signed-off-by: Yicong Yang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240223103359.18669-9-yangyicong@huawei.com Signed-off-by: Will Deacon --- .../admin-guide/perf/hisi-pcie-pmu.rst | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst index 7e863662e2d4..678d3865560c 100644 --- a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst @@ -37,9 +37,20 @@ Example usage of perf:: hisi_pcie0_core0/rx_mwr_cnt/ [kernel PMU event] ------------------------------------------ - $# perf stat -e hisi_pcie0_core0/rx_mwr_latency/ - $# perf stat -e hisi_pcie0_core0/rx_mwr_cnt/ - $# perf stat -g -e hisi_pcie0_core0/rx_mwr_latency/ -e hisi_pcie0_core0/rx_mwr_cnt/ + $# perf stat -e hisi_pcie0_core0/rx_mwr_latency,port=0xffff/ + $# perf stat -e hisi_pcie0_core0/rx_mwr_cnt,port=0xffff/ + +The related events are usually used to calculate the bandwidth, latency or others. +They need to start and end counting at the same time, therefore related events +are best used in the same event group to get the expected value.
There are two +ways to know if they are related events: +a) By event name, such as the latency events "xxx_latency, xxx_cnt" or + bandwidth events "xxx_flux, xxx_time". +b) By event type, such as "event=0xXXXX, event=0x1XXXX". + +Example usage of perf group:: + + $# perf stat -e "{hisi_pcie0_core0/rx_mwr_latency,port=0xffff/,hisi_pcie0_core0/rx_mwr_cnt,port=0xffff/}" The current driver does not support sampling. So "perf record" is unsupported. Also attach to a task is unsupported for PCIe PMU. @@ -51,8 +62,12 @@ Filter options PMU could only monitor the performance of traffic downstream target Root Ports or downstream target Endpoint. PCIe PMU driver support "port" and - "bdf" interfaces for users, and these two interfaces aren't supported at the - same time. + "bdf" interfaces for users. + Please note that one of these two interfaces must be set, and that they + aren't supported at the same time. If both are set, only the "port" + filter is valid. + If the "port" filter is not set or is set explicitly to zero (default), the + "bdf" filter will be in effect, because "bdf=0" means 0000:00:00.0. - port @@ -95,7 +110,7 @@ Filter options Example usage of perf:: - $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5 + $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,trig_len=0x4,trig_mode=1/ sleep 5 3. Threshold filter @@ -109,7 +124,7 @@ Filter options Example usage of perf:: - $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5 + $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,thr_len=0x4,thr_mode=1/ sleep 5 4. TLP Length filter @@ -127,4 +142,4 @@ Filter options Example usage of perf:: - $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,len_mode=0x1/ sleep 5 + $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,port=0xffff,len_mode=0x1/ sleep 5 From c2b24812f7bc5fbd6f2f92af070856fbe4c37b40 Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Thu, 29 Feb 2024 15:27:17 +0800 Subject: [PATCH 116/134] perf: starfive: Add StarLink PMU support This patch adds support for StarFive's StarLink PMU (Performance Monitor Unit). StarLink PMU integrates one or more CPU cores with a shared L3 memory system. The PMU supports an overflow interrupt, up to 16 programmable 64bit event counters, and an independent 64bit cycle counter. StarLink PMU is accessed via MMIO.
Example Perf stat output: [root@user]# perf stat -a -e /starfive_starlink_pmu/cycles/ \ -e /starfive_starlink_pmu/read_miss/ \ -e /starfive_starlink_pmu/read_hit/ \ -e /starfive_starlink_pmu/release_request/ \ -e /starfive_starlink_pmu/write_hit/ \ -e /starfive_starlink_pmu/write_miss/ \ -e /starfive_starlink_pmu/write_request/ \ -e /starfive_starlink_pmu/writeback/ \ -e /starfive_starlink_pmu/read_request/ \ -- openssl speed rsa2048 Doing 2048 bits private rsa's for 10s: 5 2048 bits private RSA's in 2.84s Doing 2048 bits public rsa's for 10s: 169 2048 bits public RSA's in 2.42s version: 3.0.11 built on: Tue Sep 19 13:02:31 2023 UTC options: bn(64,64) CPUINFO: N/A sign verify sign/s verify/s rsa 2048 bits 0.568000s 0.014320s 1.8 69.8 ///////// Performance counter stats for 'system wide': 649991998 starfive_starlink_pmu/cycles/ 1009690 starfive_starlink_pmu/read_miss/ 1079750 starfive_starlink_pmu/read_hit/ 2089405 starfive_starlink_pmu/release_request/ 129 starfive_starlink_pmu/write_hit/ 70 starfive_starlink_pmu/write_miss/ 194 starfive_starlink_pmu/write_request/ 150080 starfive_starlink_pmu/writeback/ 2089423 starfive_starlink_pmu/read_request/ 27.062755678 seconds time elapsed Signed-off-by: Ji Sheng Teoh Link: https://lore.kernel.org/r/20240229072720.3987876-2-jisheng.teoh@starfivetech.com Signed-off-by: Will Deacon --- drivers/perf/Kconfig | 9 + drivers/perf/Makefile | 1 + drivers/perf/starfive_starlink_pmu.c | 642 +++++++++++++++++++++++++++ 3 files changed, 652 insertions(+) create mode 100644 drivers/perf/starfive_starlink_pmu.c diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index ec6e0d9194a1..2774b18f4e27 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -86,6 +86,15 @@ config RISCV_PMU_SBI full perf feature support i.e. counter overflow, privilege mode filtering, counter configuration. +config STARFIVE_STARLINK_PMU + depends on ARCH_STARFIVE || COMPILE_TEST + bool "StarFive StarLink PMU" + help + Provide support for StarLink Performance Monitor Unit. + StarLink Performance Monitor Unit integrates one or more cores with + an L3 memory system. The L3 cache events are added into perf event + subsystem, allowing monitoring of various L3 cache perf events. + config ARM_PMU_ACPI depends on ARM_PMU && ACPI def_bool y diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index a06338e3401c..29b1c28203ef 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o obj-$(CONFIG_RISCV_PMU) += riscv_pmu.o obj-$(CONFIG_RISCV_PMU_LEGACY) += riscv_pmu_legacy.o obj-$(CONFIG_RISCV_PMU_SBI) += riscv_pmu_sbi.o +obj-$(CONFIG_STARFIVE_STARLINK_PMU) += starfive_starlink_pmu.o obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o diff --git a/drivers/perf/starfive_starlink_pmu.c b/drivers/perf/starfive_starlink_pmu.c new file mode 100644 index 000000000000..5e5a672b4229 --- /dev/null +++ b/drivers/perf/starfive_starlink_pmu.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * StarFive's StarLink PMU driver + * + * Copyright (C) 2023 StarFive Technology Co., Ltd. 
+ * + * Author: Ji Sheng Teoh + * + */ + +#define STARLINK_PMU_PDEV_NAME "starfive_starlink_pmu" +#define pr_fmt(fmt) STARLINK_PMU_PDEV_NAME ": " fmt + +#include <linux/bitmap.h> +#include <linux/cpu_pm.h> +#include <linux/cpuhotplug.h> +#include <linux/cpumask.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/mod_devicetable.h> +#include <linux/perf_event.h> +#include <linux/platform_device.h> +#include <linux/sysfs.h> + +#define STARLINK_PMU_MAX_COUNTERS 64 +#define STARLINK_PMU_NUM_COUNTERS 16 +#define STARLINK_PMU_IDX_CYCLE_COUNTER 63 + +#define STARLINK_PMU_EVENT_SELECT 0x060 +#define STARLINK_PMU_EVENT_COUNTER 0x160 +#define STARLINK_PMU_COUNTER_MASK GENMASK_ULL(63, 0) +#define STARLINK_PMU_CYCLE_COUNTER 0x058 + +#define STARLINK_PMU_CONTROL 0x040 +#define STARLINK_PMU_GLOBAL_ENABLE BIT_ULL(0) + +#define STARLINK_PMU_INTERRUPT_ENABLE 0x050 +#define STARLINK_PMU_COUNTER_OVERFLOW_STATUS 0x048 +#define STARLINK_PMU_CYCLE_OVERFLOW_MASK BIT_ULL(63) + +#define STARLINK_CYCLES 0x058 +#define CACHE_READ_REQUEST 0x04000701 +#define CACHE_WRITE_REQUEST 0x03000001 +#define CACHE_RELEASE_REQUEST 0x0003e001 +#define CACHE_READ_HIT 0x00901202 +#define CACHE_READ_MISS 0x04008002 +#define CACHE_WRITE_HIT 0x006c0002 +#define CACHE_WRITE_MISS 0x03000002 +#define CACHE_WRITEBACK 0x00000403 + +#define to_starlink_pmu(p) (container_of(p, struct starlink_pmu, pmu)) + +#define STARLINK_FORMAT_ATTR(_name, _config) \ + (&((struct dev_ext_attribute[]) { \ + { .attr = __ATTR(_name, 0444, starlink_pmu_sysfs_format_show, NULL), \ + .var = (void *)_config, } \ + })[0].attr.attr) + +#define STARLINK_EVENT_ATTR(_name, _id) \ + PMU_EVENT_ATTR_ID(_name, starlink_pmu_sysfs_event_show, _id) + +static int starlink_pmu_cpuhp_state; + +struct starlink_hw_events { + struct perf_event *events[STARLINK_PMU_MAX_COUNTERS]; + DECLARE_BITMAP(used_mask, STARLINK_PMU_MAX_COUNTERS); +}; + +struct starlink_pmu { + struct pmu pmu; + struct starlink_hw_events __percpu *hw_events; + struct hlist_node node; + struct notifier_block starlink_pmu_pm_nb; + void __iomem *pmu_base; + cpumask_t cpumask; + int irq; +}; + +static ssize_t +starlink_pmu_sysfs_format_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *eattr = container_of(attr, + struct dev_ext_attribute, attr); + + return sysfs_emit(buf, "%s\n", (char *)eattr->var); +} + +static struct attribute *starlink_pmu_format_attrs[] = { + STARLINK_FORMAT_ATTR(event, "config:0-31"), + NULL +}; + +static const struct attribute_group starlink_pmu_format_attr_group = { + .name = "format", + .attrs = starlink_pmu_format_attrs, +}; + +static ssize_t +starlink_pmu_sysfs_event_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *eattr = container_of(attr, + struct perf_pmu_events_attr, attr); + + return sysfs_emit(buf, "event=0x%02llx\n", eattr->id); +} + +static struct attribute *starlink_pmu_event_attrs[] = { + STARLINK_EVENT_ATTR(cycles, STARLINK_CYCLES), + STARLINK_EVENT_ATTR(read_request, CACHE_READ_REQUEST), + STARLINK_EVENT_ATTR(write_request, CACHE_WRITE_REQUEST), + STARLINK_EVENT_ATTR(release_request, CACHE_RELEASE_REQUEST), + STARLINK_EVENT_ATTR(read_hit, CACHE_READ_HIT), + STARLINK_EVENT_ATTR(read_miss, CACHE_READ_MISS), + STARLINK_EVENT_ATTR(write_hit, CACHE_WRITE_HIT), + STARLINK_EVENT_ATTR(write_miss, CACHE_WRITE_MISS), + STARLINK_EVENT_ATTR(writeback, CACHE_WRITEBACK), + NULL +}; + +static const struct attribute_group starlink_pmu_events_attr_group = { + .name = "events", + .attrs = starlink_pmu_event_attrs, +}; + +static ssize_t +cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct starlink_pmu *starlink_pmu =
to_starlink_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, &starlink_pmu->cpumask); +} + +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *starlink_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group starlink_pmu_cpumask_attr_group = { + .attrs = starlink_pmu_cpumask_attrs, +}; + +static const struct attribute_group *starlink_pmu_attr_groups[] = { + &starlink_pmu_format_attr_group, + &starlink_pmu_events_attr_group, + &starlink_pmu_cpumask_attr_group, + NULL +}; + +static void starlink_pmu_set_event_period(struct perf_event *event) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = event->hw.idx; + + /* + * Program counter to half of its max count to handle + * cases of extreme interrupt latency. + */ + u64 val = STARLINK_PMU_COUNTER_MASK >> 1; + + local64_set(&hwc->prev_count, val); + if (hwc->config == STARLINK_CYCLES) + writeq(val, starlink_pmu->pmu_base + STARLINK_PMU_CYCLE_COUNTER); + else + writeq(val, starlink_pmu->pmu_base + STARLINK_PMU_EVENT_COUNTER + + idx * sizeof(u64)); +} + +static void starlink_pmu_counter_start(struct perf_event *event, + struct starlink_pmu *starlink_pmu) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = event->hw.idx; + u64 val; + + /* + * Enable counter overflow interrupt[63:0], + * which is mapped as follows: + * + * event counter 0 - Bit [0] + * event counter 1 - Bit [1] + * ... + * cycle counter - Bit [63] + */ + val = readq(starlink_pmu->pmu_base + STARLINK_PMU_INTERRUPT_ENABLE); + + if (hwc->config == STARLINK_CYCLES) { + /* + * Cycle count has its dedicated register, and it starts + * counting as soon as STARLINK_PMU_GLOBAL_ENABLE is set. + */ + val |= STARLINK_PMU_CYCLE_OVERFLOW_MASK; + } else { + writeq(event->hw.config, starlink_pmu->pmu_base + + STARLINK_PMU_EVENT_SELECT + idx * sizeof(u64)); + + val |= BIT_ULL(idx); + } + + writeq(val, starlink_pmu->pmu_base + STARLINK_PMU_INTERRUPT_ENABLE); + + writeq(STARLINK_PMU_GLOBAL_ENABLE, starlink_pmu->pmu_base + + STARLINK_PMU_CONTROL); +} + +static void starlink_pmu_counter_stop(struct perf_event *event, + struct starlink_pmu *starlink_pmu) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = event->hw.idx; + u64 val; + + val = readq(starlink_pmu->pmu_base + STARLINK_PMU_CONTROL); + val &= ~STARLINK_PMU_GLOBAL_ENABLE; + writeq(val, starlink_pmu->pmu_base + STARLINK_PMU_CONTROL); + + val = readq(starlink_pmu->pmu_base + STARLINK_PMU_INTERRUPT_ENABLE); + if (hwc->config == STARLINK_CYCLES) + val &= ~STARLINK_PMU_CYCLE_OVERFLOW_MASK; + else + val &= ~BIT_ULL(idx); + + writeq(val, starlink_pmu->pmu_base + STARLINK_PMU_INTERRUPT_ENABLE); +} + +static void starlink_pmu_update(struct perf_event *event) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u64 prev_raw_count, new_raw_count; + u64 oldval; + u64 delta; + + do { + prev_raw_count = local64_read(&hwc->prev_count); + if (hwc->config == STARLINK_CYCLES) + new_raw_count = readq(starlink_pmu->pmu_base + + STARLINK_PMU_CYCLE_COUNTER); + else + new_raw_count = readq(starlink_pmu->pmu_base + + STARLINK_PMU_EVENT_COUNTER + + idx * sizeof(u64)); + oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count); + } while (oldval != prev_raw_count); + + delta = (new_raw_count - prev_raw_count) & STARLINK_PMU_COUNTER_MASK; + local64_add(delta, &event->count); +} + +static void
starlink_pmu_start(struct perf_event *event, int flags) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + starlink_pmu_set_event_period(event); + starlink_pmu_counter_start(event, starlink_pmu); + + perf_event_update_userpage(event); +} + +static void starlink_pmu_stop(struct perf_event *event, int flags) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + if (hwc->state & PERF_HES_STOPPED) + return; + + starlink_pmu_counter_stop(event, starlink_pmu); + starlink_pmu_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int starlink_pmu_add(struct perf_event *event, int flags) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct starlink_hw_events *hw_events = + this_cpu_ptr(starlink_pmu->hw_events); + struct hw_perf_event *hwc = &event->hw; + unsigned long *used_mask = hw_events->used_mask; + u32 n_events = STARLINK_PMU_NUM_COUNTERS; + int idx; + + /* + * The cycle counter has a dedicated register to hold its value. + * Events other than the cycle count have to be enabled through + * the event select register and are assigned an independent + * counter as they appear. + */ + + if (hwc->config == STARLINK_CYCLES) { + idx = STARLINK_PMU_IDX_CYCLE_COUNTER; + } else { + idx = find_first_zero_bit(used_mask, n_events); + /* All counters are in use (find_first_zero_bit() returns n_events) */ + if (idx == n_events) + return -EAGAIN; + + set_bit(idx, used_mask); + } + + hwc->idx = idx; + hw_events->events[idx] = event; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + starlink_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + + return 0; +} + +static void starlink_pmu_del(struct perf_event *event, int flags) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct starlink_hw_events *hw_events = + this_cpu_ptr(starlink_pmu->hw_events); + struct hw_perf_event *hwc = &event->hw; + + starlink_pmu_stop(event, PERF_EF_UPDATE); + hw_events->events[hwc->idx] = NULL; + clear_bit(hwc->idx, hw_events->used_mask); + + perf_event_update_userpage(event); +} + +static bool starlink_pmu_validate_event_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct perf_event *sibling; + int counter = 1; + + /* + * Ensure hardware events in the group are on the same PMU, + * software events are acceptable. + */ + if (event->group_leader->pmu != event->pmu && + !is_software_event(event->group_leader)) + return false; + + for_each_sibling_event(sibling, leader) { + if (sibling->pmu != event->pmu && !is_software_event(sibling)) + return false; + + counter++; + } + + return counter <= STARLINK_PMU_NUM_COUNTERS; +} + +static int starlink_pmu_event_init(struct perf_event *event) +{ + struct starlink_pmu *starlink_pmu = to_starlink_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* + * Sampling is not supported, as counters are shared + * by all CPUs. + */ + if (hwc->sample_period) + return -EOPNOTSUPP; + + /* + * Per-task and attach to a task are not supported, + * as uncore events are not specific to any CPU.
+ */ + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) + return -EOPNOTSUPP; + + if (!starlink_pmu_validate_event_group(event)) + return -EINVAL; + + hwc->idx = -1; + hwc->config = event->attr.config; + event->cpu = cpumask_first(&starlink_pmu->cpumask); + + return 0; +} + +static irqreturn_t starlink_pmu_handle_irq(int irq_num, void *data) +{ + struct starlink_pmu *starlink_pmu = data; + struct starlink_hw_events *hw_events = + this_cpu_ptr(starlink_pmu->hw_events); + bool handled = false; + int idx; + u64 overflow_status; + + for (idx = 0; idx < STARLINK_PMU_MAX_COUNTERS; idx++) { + struct perf_event *event = hw_events->events[idx]; + + if (!event) + continue; + + overflow_status = readq(starlink_pmu->pmu_base + + STARLINK_PMU_COUNTER_OVERFLOW_STATUS); + if (!(overflow_status & BIT_ULL(idx))) + continue; + + writeq(BIT_ULL(idx), starlink_pmu->pmu_base + + STARLINK_PMU_COUNTER_OVERFLOW_STATUS); + + starlink_pmu_update(event); + starlink_pmu_set_event_period(event); + handled = true; + } + return IRQ_RETVAL(handled); +} + +static int starlink_setup_irqs(struct starlink_pmu *starlink_pmu, + struct platform_device *pdev) +{ + int ret, irq; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return -EINVAL; + + ret = devm_request_irq(&pdev->dev, irq, starlink_pmu_handle_irq, + 0, STARLINK_PMU_PDEV_NAME, starlink_pmu); + if (ret) + return dev_err_probe(&pdev->dev, ret, "Failed to request IRQ\n"); + + starlink_pmu->irq = irq; + + return 0; +} + +static int starlink_pmu_pm_notify(struct notifier_block *b, + unsigned long cmd, void *v) +{ + struct starlink_pmu *starlink_pmu = container_of(b, struct starlink_pmu, + starlink_pmu_pm_nb); + struct starlink_hw_events *hw_events = + this_cpu_ptr(starlink_pmu->hw_events); + int enabled = bitmap_weight(hw_events->used_mask, + STARLINK_PMU_MAX_COUNTERS); + struct perf_event *event; + int idx; + + if (!enabled) + return NOTIFY_OK; + + for (idx = 0; idx < STARLINK_PMU_MAX_COUNTERS; idx++) { + event = hw_events->events[idx]; + if (!event) + continue; + + switch (cmd) { + case CPU_PM_ENTER: + /* Stop and update the counter */ + starlink_pmu_stop(event, PERF_EF_UPDATE); + break; + case CPU_PM_EXIT: + case CPU_PM_ENTER_FAILED: + /* Restore and enable the counter */ + starlink_pmu_start(event, PERF_EF_RELOAD); + break; + default: + break; + } + } + + return NOTIFY_OK; +} + +static int starlink_pmu_pm_register(struct starlink_pmu *starlink_pmu) +{ + if (!IS_ENABLED(CONFIG_CPU_PM)) + return 0; + + starlink_pmu->starlink_pmu_pm_nb.notifier_call = starlink_pmu_pm_notify; + return cpu_pm_register_notifier(&starlink_pmu->starlink_pmu_pm_nb); +} + +static void starlink_pmu_pm_unregister(struct starlink_pmu *starlink_pmu) +{ + if (!IS_ENABLED(CONFIG_CPU_PM)) + return; + + cpu_pm_unregister_notifier(&starlink_pmu->starlink_pmu_pm_nb); +} + +static void starlink_pmu_destroy(struct starlink_pmu *starlink_pmu) +{ + starlink_pmu_pm_unregister(starlink_pmu); + cpuhp_state_remove_instance(starlink_pmu_cpuhp_state, + &starlink_pmu->node); +} + +static int starlink_pmu_probe(struct platform_device *pdev) +{ + struct starlink_pmu *starlink_pmu; + struct starlink_hw_events *hw_events; + struct resource *res; + int cpuid, i, ret; + + starlink_pmu = devm_kzalloc(&pdev->dev, sizeof(*starlink_pmu), GFP_KERNEL); + if (!starlink_pmu) + return -ENOMEM; + + starlink_pmu->pmu_base = + devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(starlink_pmu->pmu_base)) + return PTR_ERR(starlink_pmu->pmu_base); + + starlink_pmu->hw_events = alloc_percpu_gfp(struct 
starlink_hw_events, + GFP_KERNEL); + if (!starlink_pmu->hw_events) { + dev_err(&pdev->dev, "Failed to allocate per-cpu PMU data\n"); + return -ENOMEM; + } + + for_each_possible_cpu(cpuid) { + hw_events = per_cpu_ptr(starlink_pmu->hw_events, cpuid); + for (i = 0; i < STARLINK_PMU_MAX_COUNTERS; i++) + hw_events->events[i] = NULL; + } + + ret = starlink_setup_irqs(starlink_pmu, pdev); + if (ret) + return ret; + + ret = cpuhp_state_add_instance(starlink_pmu_cpuhp_state, + &starlink_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Failed to register hotplug\n"); + return ret; + } + + ret = starlink_pmu_pm_register(starlink_pmu); + if (ret) { + cpuhp_state_remove_instance(starlink_pmu_cpuhp_state, + &starlink_pmu->node); + return ret; + } + + starlink_pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + .event_init = starlink_pmu_event_init, + .add = starlink_pmu_add, + .del = starlink_pmu_del, + .start = starlink_pmu_start, + .stop = starlink_pmu_stop, + .read = starlink_pmu_update, + .attr_groups = starlink_pmu_attr_groups, + }; + + ret = perf_pmu_register(&starlink_pmu->pmu, STARLINK_PMU_PDEV_NAME, -1); + if (ret) + starlink_pmu_destroy(starlink_pmu); + + return ret; +} + +static const struct of_device_id starlink_pmu_of_match[] = { + { .compatible = "starfive,jh8100-starlink-pmu" }, + {} +}; +MODULE_DEVICE_TABLE(of, starlink_pmu_of_match); + +static struct platform_driver starlink_pmu_driver = { + .driver = { + .name = STARLINK_PMU_PDEV_NAME, + .of_match_table = starlink_pmu_of_match, + .suppress_bind_attrs = true, + }, + .probe = starlink_pmu_probe, +}; + +static int +starlink_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct starlink_pmu *starlink_pmu = hlist_entry_safe(node, + struct starlink_pmu, + node); + + if (cpumask_empty(&starlink_pmu->cpumask)) + cpumask_set_cpu(cpu, &starlink_pmu->cpumask); + + WARN_ON(irq_set_affinity(starlink_pmu->irq, cpumask_of(cpu))); + + return 0; +} + +static int +starlink_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct starlink_pmu *starlink_pmu = hlist_entry_safe(node, + struct starlink_pmu, + node); + unsigned int target; + + if (!cpumask_test_and_clear_cpu(cpu, &starlink_pmu->cpumask)) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + perf_pmu_migrate_context(&starlink_pmu->pmu, cpu, target); + + cpumask_set_cpu(target, &starlink_pmu->cpumask); + WARN_ON(irq_set_affinity(starlink_pmu->irq, cpumask_of(target))); + + return 0; +} + +static int __init starlink_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "soc/starfive/starlink_pmu:online", + starlink_pmu_online_cpu, + starlink_pmu_offline_cpu); + if (ret < 0) + return ret; + + starlink_pmu_cpuhp_state = ret; + + return platform_driver_register(&starlink_pmu_driver); +} + +device_initcall(starlink_pmu_init); From 66461b43b0c05da2e7c606b9eea7f1f3b565b9c3 Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Thu, 29 Feb 2024 15:27:18 +0800 Subject: [PATCH 117/134] dt-bindings: perf: starfive: Add JH8100 StarLink PMU Add device tree binding for StarFive's JH8100 StarLink PMU (Performance Monitor Unit). 
Signed-off-by: Ji Sheng Teoh Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240229072720.3987876-3-jisheng.teoh@starfivetech.com Signed-off-by: Will Deacon --- .../perf/starfive,jh8100-starlink-pmu.yaml | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/starfive,jh8100-starlink-pmu.yaml diff --git a/Documentation/devicetree/bindings/perf/starfive,jh8100-starlink-pmu.yaml b/Documentation/devicetree/bindings/perf/starfive,jh8100-starlink-pmu.yaml new file mode 100644 index 000000000000..915c6b814026 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/starfive,jh8100-starlink-pmu.yaml @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/perf/starfive,jh8100-starlink-pmu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: StarFive JH8100 StarLink PMU + +maintainers: + - Ji Sheng Teoh + +description: + StarFive's JH8100 StarLink PMU integrates one or more CPU cores with a + shared L3 memory system. The PMU supports overflow interrupt, up to + 16 programmable 64bit event counters, and an independent 64bit cycle + counter. StarFive's JH8100 StarLink PMU is accessed via MMIO. + +properties: + compatible: + const: starfive,jh8100-starlink-pmu + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + soc { + #address-cells = <2>; + #size-cells = <2>; + + pmu@12900000 { + compatible = "starfive,jh8100-starlink-pmu"; + reg = <0x0 0x12900000 0x0 0x10000>; + interrupts = <34>; + }; + }; From 49925c1c5a6c93a857b3dffcce3a7fb48ec72cbb Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Thu, 29 Feb 2024 15:27:19 +0800 Subject: [PATCH 118/134] docs: perf: Add description for StarFive's StarLink PMU StarFive StarLink PMU supports monitoring L3 memory system PMU events. Add documentation to describe StarFive StarLink PMU support and its usage. Signed-off-by: Ji Sheng Teoh Link: https://lore.kernel.org/r/20240229072720.3987876-4-jisheng.teoh@starfivetech.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/index.rst | 1 + .../perf/starfive_starlink_pmu.rst | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 Documentation/admin-guide/perf/starfive_starlink_pmu.rst diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index f4a4513c526f..7eb3dcd6f4da 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -13,6 +13,7 @@ Performance monitor support imx-ddr qcom_l2_pmu qcom_l3_pmu + starfive_starlink_pmu arm-ccn arm-cmn xgene-pmu diff --git a/Documentation/admin-guide/perf/starfive_starlink_pmu.rst b/Documentation/admin-guide/perf/starfive_starlink_pmu.rst new file mode 100644 index 000000000000..2932ddb4eb76 --- /dev/null +++ b/Documentation/admin-guide/perf/starfive_starlink_pmu.rst @@ -0,0 +1,46 @@ +================================================ +StarFive StarLink Performance Monitor Unit (PMU) +================================================ + +StarFive StarLink Performance Monitor Unit (PMU) exists within the +StarLink Coherent Network on Chip (CNoC) that connects multiple CPU +clusters with an L3 memory system. + +The uncore PMU supports overflow interrupt, up to 16 programmable 64bit +event counters, and an independent 64bit cycle counter.
+The PMU can only be accessed via Memory Mapped I/O and is common to the +cores connected to the same PMU. + +Driver exposes supported PMU events in sysfs "events" directory under:: + + /sys/bus/event_source/devices/starfive_starlink_pmu/events/ + +Driver exposes the CPU used to handle PMU events in sysfs "cpumask" +attribute under:: + + /sys/bus/event_source/devices/starfive_starlink_pmu/cpumask/ + +Driver describes the format of config (event ID) in sysfs "format" directory +under:: + + /sys/bus/event_source/devices/starfive_starlink_pmu/format/ + +Example of perf usage:: + + $ perf list + + starfive_starlink_pmu/cycles/ [Kernel PMU event] + starfive_starlink_pmu/read_hit/ [Kernel PMU event] + starfive_starlink_pmu/read_miss/ [Kernel PMU event] + starfive_starlink_pmu/read_request/ [Kernel PMU event] + starfive_starlink_pmu/release_request/ [Kernel PMU event] + starfive_starlink_pmu/write_hit/ [Kernel PMU event] + starfive_starlink_pmu/write_miss/ [Kernel PMU event] + starfive_starlink_pmu/write_request/ [Kernel PMU event] + starfive_starlink_pmu/writeback/ [Kernel PMU event] + + + $ perf stat -a -e /starfive_starlink_pmu/cycles/ sleep 1 + +Sampling is not supported. As a result, "perf record" is not supported. +Attaching to a task is not supported, only system-wide counting is supported. From b9f71ab2152e5b344c02eb3ff43637162aaf29e6 Mon Sep 17 00:00:00 2001 From: Ji Sheng Teoh Date: Thu, 29 Feb 2024 15:27:20 +0800 Subject: [PATCH 119/134] MAINTAINERS: Add entry for StarFive StarLink PMU Add a maintainer entry for the StarFive StarLink PMU driver and mark it as "Maintained". Signed-off-by: Ji Sheng Teoh Link: https://lore.kernel.org/r/20240229072720.3987876-5-jisheng.teoh@starfivetech.com Signed-off-by: Will Deacon --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 960512bec428..b7306d64d58b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20940,6 +20940,13 @@ S: Maintained T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ F: Documentation/devicetree/bindings/soc/starfive/ +STARFIVE STARLINK PMU DRIVER +M: Ji Sheng Teoh +S: Maintained +F: Documentation/admin-guide/perf/starfive_starlink_pmu.rst +F: Documentation/devicetree/bindings/perf/starfive,jh8100-starlink-pmu.yaml +F: drivers/perf/starfive_starlink_pmu.c + STARFIVE TRNG DRIVER M: Jia Jie Ho S: Supported From f0dbc6d0de38df42184776aa8564c12ceb6f1d61 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Mar 2024 10:56:49 +0000 Subject: [PATCH 120/134] perf: starfive: Only allow COMPILE_TEST for 64-bit architectures The kbuild robot exploded while wasting its time building the Starfive PMU driver for the 32-bit PA-RISC and Hexagon architectures. Adjust the Kconfig dependencies so that COMPILE_TEST is only applicable for 64-bit architectures (which implement writeq()). Reported-by: kernel test robot Signed-off-by: Will Deacon --- drivers/perf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 2774b18f4e27..004d86230aa6 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -87,7 +87,7 @@ config RISCV_PMU_SBI filtering, counter configuration. config STARFIVE_STARLINK_PMU - depends on ARCH_STARFIVE || COMPILE_TEST + depends on ARCH_STARFIVE || (COMPILE_TEST && 64BIT) bool "StarFive StarLink PMU" help Provide support for StarLink Performance Monitor Unit.
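[Editor's aside on the dependency change above, not from the patch: the driver's counter accessors are readq()/writeq(), which only exist natively on 64-bit architectures. A 32-bit build could only obtain writeq() by explicitly opting in to a non-atomic split, e.g.:

    /* Hypothetical 32-bit accommodation; NOT used by this driver. */
    #include <linux/io-64-nonatomic-lo-hi.h>	/* writeq() becomes two writel()s */

Splitting an access to a live 64-bit counter into two 32-bit accesses could return torn values, so restricting COMPILE_TEST to 64BIT is the cleaner fix.]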
From b037e40a6af2b056f7f15d9aabe7e9a9a7149ff3 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 5 Mar 2024 20:25:17 +0800 Subject: [PATCH 121/134] docs: perf: Fix build warning of hisi-pcie-pmu.rst `make htmldocs SPHINXDIRS="admin-guide"` shows the following warnings: Documentation/admin-guide/perf/hisi-pcie-pmu.rst:48: ERROR: Unexpected indentation. Documentation/admin-guide/perf/hisi-pcie-pmu.rst:49: WARNING: Block quote ends without a blank line; unexpected unindent. Fix this. Closes: https://lore.kernel.org/lkml/20231011172250.5a6498e5@canb.auug.org.au/ Fixes: 89a032923d4b ("docs: perf: Update usage for target filter of hisi-pcie-pmu") Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20240305122517.12179-1-yangyicong@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pcie-pmu.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst index 678d3865560c..5541ff40e06a 100644 --- a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst @@ -44,6 +44,7 @@ The related events are usually used to calculate the bandwidth, latency or others. They need to start and end counting at the same time, therefore related events are best used in the same event group to get the expected value. There are two ways to know if they are related events: + a) By event name, such as the latency events "xxx_latency, xxx_cnt" or bandwidth events "xxx_flux, xxx_time". b) By event type, such as "event=0xXXXX, event=0x1XXXX". From cc9f69a3dad3b64b299dc2d5f95935fe16cb8b79 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:46 +0000 Subject: [PATCH 122/134] arm64/cpufeature: Hook new identification registers up to cpufeature The 2023 architecture extensions have defined several new ID registers; hook them up to the cpufeature code so we can add feature checks and hwcaps based on their contents.
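[Editor's illustration, not part of the patch: once an ID register is hooked into the cpufeature code, the kernel's MRS emulation returns the sanitised system-wide value to EL0, so userspace can inspect the new registers directly. A minimal sketch, assuming the raw S3_0_C0_C6_3 encoding of ID_AA64ISAR3_EL1 for toolchains that do not know the register name yet:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t isar3;

    	/* Traps to EL1; the kernel emulates the read with the
    	 * sanitised value populated by this patch. */
    	asm volatile("mrs %0, S3_0_C0_C6_3" : "=r"(isar3));
    	printf("ID_AA64ISAR3_EL1: 0x%016llx\n", (unsigned long long)isar3);
    	return 0;
    }
]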
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-1-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpu.h | 3 +++ arch/arm64/kernel/cpufeature.c | 28 ++++++++++++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 3 +++ 3 files changed, 34 insertions(+) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b1e43f56ee46..96379be913cd 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -52,14 +52,17 @@ struct cpuinfo_arm64 { u64 reg_id_aa64isar0; u64 reg_id_aa64isar1; u64 reg_id_aa64isar2; + u64 reg_id_aa64isar3; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; u64 reg_id_aa64mmfr2; u64 reg_id_aa64mmfr3; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; + u64 reg_id_aa64pfr2; u64 reg_id_aa64zfr0; u64 reg_id_aa64smfr0; + u64 reg_id_aa64fpfr0; struct cpuinfo_32bit aarch32; }; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 8d1a634a403e..eae59ec0f4b0 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -234,6 +234,10 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0), @@ -267,6 +271,10 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0), @@ -319,6 +327,10 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0), @@ -702,10 +714,12 @@ static const struct __ftr_reg_entry { &id_aa64pfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1, &id_aa64pfr1_override), + ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0, &id_aa64zfr0_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0, &id_aa64smfr0_override), + ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -717,6 +731,7 @@ static const struct __ftr_reg_entry { &id_aa64isar1_override), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), + ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3), /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), @@ -1043,14 +1058,17 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0); init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2); + init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); 
init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); + init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2); init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0); init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0); + init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0); if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) init_32bit_cpu_features(&info->aarch32); @@ -1272,6 +1290,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64isar1, boot->reg_id_aa64isar1); taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu, info->reg_id_aa64isar2, boot->reg_id_aa64isar2); + taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu, + info->reg_id_aa64isar3, boot->reg_id_aa64isar3); /* * Differing PARange support is fine as long as all peripherals and @@ -1291,6 +1311,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu, info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu, + info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2); taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu, info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0); @@ -1298,6 +1320,9 @@ void update_cpu_features(int cpu, taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu, info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0); + taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu, + info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0); + /* Probe vector lengths */ if (IS_ENABLED(CONFIG_ARM64_SVE) && id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { @@ -1410,8 +1435,10 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64PFR0_EL1); read_sysreg_case(SYS_ID_AA64PFR1_EL1); + read_sysreg_case(SYS_ID_AA64PFR2_EL1); read_sysreg_case(SYS_ID_AA64ZFR0_EL1); read_sysreg_case(SYS_ID_AA64SMFR0_EL1); + read_sysreg_case(SYS_ID_AA64FPFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR0_EL1); read_sysreg_case(SYS_ID_AA64DFR1_EL1); read_sysreg_case(SYS_ID_AA64MMFR0_EL1); @@ -1421,6 +1448,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id) read_sysreg_case(SYS_ID_AA64ISAR0_EL1); read_sysreg_case(SYS_ID_AA64ISAR1_EL1); read_sysreg_case(SYS_ID_AA64ISAR2_EL1); + read_sysreg_case(SYS_ID_AA64ISAR3_EL1); read_sysreg_case(SYS_CNTFRQ_EL0); read_sysreg_case(SYS_CTR_EL0); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 47043c0d95ec..12b192060156 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -443,14 +443,17 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1); + info->reg_id_aa64isar3 = read_cpuid(ID_AA64ISAR3_EL1); info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1); info->reg_id_aa64mmfr3 = read_cpuid(ID_AA64MMFR3_EL1); info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); + info->reg_id_aa64pfr2 = read_cpuid(ID_AA64PFR2_EL1); info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1); info->reg_id_aa64smfr0 = 
read_cpuid(ID_AA64SMFR0_EL1); + info->reg_id_aa64fpfr0 = read_cpuid(ID_AA64FPFR0_EL1); if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) info->reg_gmid = read_cpuid(GMID_EL1); From b6c0b424cb91a864e62533a6520743ddcdde5270 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:47 +0000 Subject: [PATCH 123/134] arm64/fpsimd: Enable host kernel access to FPMR FEAT_FPMR provides a new generally accessible architectural register, FPMR. This is only accessible to EL0 and EL1 when HCRX_EL2.EnFPM is set to 1; do this when the host is running. The guest part will be done along with context switching the new register and exposing it via guest management. Acked-by: Marc Zyngier Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-2-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kvm_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3c6f8ba1e479..7f45ce9170bb 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -105,7 +105,7 @@ #define HCRX_GUEST_FLAGS \ (HCRX_EL2_SMPME | HCRX_EL2_TCR2En | \ (cpus_have_final_cap(ARM64_HAS_MOPS) ? (HCRX_EL2_MSCEn | HCRX_EL2_MCE2) : 0)) -#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En) +#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) /* TCR_EL2 Registers bits */ #define TCR_EL2_DS (1UL << 32) From 203f2b95a882dc46dd9873562167db69a1f61711 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:48 +0000 Subject: [PATCH 124/134] arm64/fpsimd: Support FEAT_FPMR FEAT_FPMR defines a new EL0-accessible register, FPMR, used to configure the FP8-related features added to the architecture at the same time. Detect support for this register and context switch it for EL0 when present. Because responsibility for saving floating point state is shared between the host kernel and KVM, FP8 support is not yet implemented in KVM, and a stub similar to that used for SVCR is provided for FPMR in order to avoid bisection issues. To make it easier to share host state with the hypervisor we store FPMR as a hardened usercopy field in uw (along with some padding).
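[Editor's sketch, not from the series: with FEAT_FPMR detected and SCTLR_EL1.EnFPM set by cpu_enable_fpmr() in the hunks below, EL0 code can access FPMR directly and rely on the kernel to context switch it per task. The raw S3_3_C4_C4_2 encoding is an assumption (FPMR is expected to sit next to FPCR/FPSR at S3_3_C4_C4_0/1) for assemblers that do not yet accept the FPMR name:

    #include <stdint.h>

    static inline uint64_t read_fpmr(void)
    {
    	uint64_t v;

    	/* Assumed FPMR encoding; verify against the ARM ARM. */
    	asm volatile("mrs %0, S3_3_C4_C4_2" : "=r"(v));
    	return v;
    }

    static inline void write_fpmr(uint64_t v)
    {
    	asm volatile("msr S3_3_C4_C4_2, %0" : : "r"(v));
    }
]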
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-3-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 5 +++++ arch/arm64/include/asm/fpsimd.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/processor.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 9 +++++++++ arch/arm64/kernel/fpsimd.c | 13 +++++++++++++ arch/arm64/kvm/fpsimd.c | 1 + arch/arm64/tools/cpucaps | 1 + 8 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 21c824edf8ce..34fcdbc65d7d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -768,6 +768,11 @@ static __always_inline bool system_supports_tpidr2(void) return system_supports_sme(); } +static __always_inline bool system_supports_fpmr(void) +{ + return alternative_has_cap_unlikely(ARM64_HAS_FPMR); +} + static __always_inline bool system_supports_cnp(void) { return alternative_has_cap_unlikely(ARM64_HAS_CNP); diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 50e5f25d3024..74afca3bd312 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -89,6 +89,7 @@ struct cpu_fp_state { void *sve_state; void *sme_state; u64 *svcr; + u64 *fpmr; unsigned int sve_vl; unsigned int sme_vl; enum fp_type *fp_type; @@ -154,6 +155,7 @@ extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused); extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused); +extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused); extern u64 read_smcr_features(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 21c57b812569..b779cbc2211c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -543,6 +543,7 @@ struct kvm_vcpu_arch { enum fp_type fp_type; unsigned int sve_max_vl; u64 svcr; + u64 fpmr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5b0a04810b23..f77371232d8c 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -155,6 +155,8 @@ struct thread_struct { struct { unsigned long tp_value; /* TLS register */ unsigned long tp2_value; + u64 fpmr; + unsigned long pad; struct user_fpsimd_state fpsimd_state; } uw; @@ -253,6 +255,8 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, BUILD_BUG_ON(sizeof_field(struct thread_struct, uw) != sizeof_field(struct thread_struct, uw.tp_value) + sizeof_field(struct thread_struct, uw.tp2_value) + + sizeof_field(struct thread_struct, uw.fpmr) + + sizeof_field(struct thread_struct, uw.pad) + sizeof_field(struct thread_struct, uw.fpsimd_state)); *offset = offsetof(struct thread_struct, uw); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index eae59ec0f4b0..0263565f617a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -272,6 +272,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -2767,6 
+2768,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_lpa2, }, + { + .desc = "FPMR", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_FPMR, + .matches = has_cpuid_feature, + .cpu_enable = cpu_enable_fpmr, + ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP) + }, {}, }; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a5dc6f764195..8e24b5e5e192 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -359,6 +359,9 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: @@ -446,6 +449,9 @@ static void fpsimd_save_user_state(void) if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; + if (system_supports_fpmr()) + *(last->fpmr) = read_sysreg_s(SYS_FPMR); + /* * If a task is in a syscall the ABI allows us to only * preserve the state shared with FPSIMD so don't bother @@ -688,6 +694,12 @@ static void sve_to_fpsimd(struct task_struct *task) } } +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +{ + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} + #ifdef CONFIG_ARM64_SVE /* * Call __sve_free() directly only if you know task can't be scheduled @@ -1680,6 +1692,7 @@ static void fpsimd_bind_task_to_cpu(void) last->sve_vl = task_get_sve_vl(current); last->sme_vl = task_get_sme_vl(current); last->svcr = &current->thread.svcr; + last->fpmr = &current->thread.uw.fpmr; last->fp_type = &current->thread.fp_type; last->to_save = FP_STATE_CURRENT; current->thread.fpsimd_cpu = smp_processor_id(); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 8c1d0d4853df..e3e611e30e91 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -153,6 +153,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; fp_state.svcr = &vcpu->arch.svcr; + fp_state.fpmr = &vcpu->arch.fpmr; fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b912b1409fc0..63283550c8e8 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -26,6 +26,7 @@ HAS_ECV HAS_ECV_CNTPOFF HAS_EPAN HAS_EVT +HAS_FPMR HAS_FGT HAS_FPSIMD HAS_GENERIC_AUTH From 8c46def44409fc914278630b7ba5ac142ab7c4f4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:49 +0000 Subject: [PATCH 125/134] arm64/signal: Add FPMR signal handling Expose FPMR in the signal context on systems where it is supported. The kernel validates the exact size of the FPSIMD registers so we can't readily add it to fpsimd_context without disruption.
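[Editor's sketch of the userspace side, not part of the patch: a signal handler can locate the new record by walking the _aarch64_ctx headers in the frame's __reserved area, matching FPMR_MAGIC (0x46504d52, ASCII "FPMR") as defined below; the EXTRA_MAGIC indirection is ignored here for brevity:

    #include <stdint.h>
    #include <ucontext.h>
    #include <asm/sigcontext.h>

    /* Returns 0 when no FPMR record is present (e.g. no FEAT_FPMR). */
    static uint64_t fpmr_from_frame(ucontext_t *uc)
    {
    	struct _aarch64_ctx *head =
    		(struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

    	while (head->magic) {
    		if (head->magic == FPMR_MAGIC)
    			return ((struct fpmr_context *)head)->fpmr;
    		head = (struct _aarch64_ctx *)((char *)head + head->size);
    	}

    	return 0;
    }
]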
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-4-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 8 ++++ arch/arm64/kernel/signal.c | 59 ++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index f23c1dc3f002..8a45b7a411e0 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -152,6 +152,14 @@ struct tpidr2_context { __u64 tpidr2; }; +/* FPMR context */ +#define FPMR_MAGIC 0x46504d52 + +struct fpmr_context { + struct _aarch64_ctx head; + __u64 fpmr; +}; + #define ZA_MAGIC 0x54366345 struct za_context { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 0e8beb3349ea..460823baa603 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -60,6 +60,7 @@ struct rt_sigframe_user_layout { unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; + unsigned long fpmr_offset; unsigned long extra_offset; unsigned long end_offset; }; @@ -182,6 +183,8 @@ struct user_ctxs { u32 za_size; struct zt_context __user *zt; u32 zt_size; + struct fpmr_context __user *fpmr; + u32 fpmr_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -227,6 +230,33 @@ static int restore_fpsimd_context(struct user_ctxs *user) return err ? -EFAULT : 0; } +static int preserve_fpmr_context(struct fpmr_context __user *ctx) +{ + int err = 0; + + current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); + + __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); + + return err; +} + +static int restore_fpmr_context(struct user_ctxs *user) +{ + u64 fpmr; + int err = 0; + + if (user->fpmr_size != sizeof(*user->fpmr)) + return -EINVAL; + + __get_user_error(fpmr, &user->fpmr->fpmr, err); + if (!err) + write_sysreg_s(fpmr, SYS_FPMR); + + return err; +} #ifdef CONFIG_ARM64_SVE @@ -590,6 +620,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; + user->fpmr = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -684,6 +715,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt_size = size; break; + case FPMR_MAGIC: + if (!system_supports_fpmr()) + goto invalid; + + if (user->fpmr) + goto invalid; + + user->fpmr = (struct fpmr_context __user *)head; + user->fpmr_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -806,6 +848,9 @@ static int restore_sigframe(struct pt_regs *regs, if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); + if (err == 0 && system_supports_fpmr() && user.fpmr) + err = restore_fpmr_context(&user); + if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); @@ -928,6 +973,13 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, } } + if (system_supports_fpmr()) { + err = sigframe_alloc(user, &user->fpmr_offset, + sizeof(struct fpmr_context)); + if (err) + return err; + } + return sigframe_alloc_end(user); } @@ -983,6 +1035,13 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_tpidr2_context(tpidr2_ctx); } + /* FPMR if supported */ + if (system_supports_fpmr() && err == 0) { + struct fpmr_context __user *fpmr_ctx = + apply_user_offset(user, 
user->fpmr_offset); + err |= preserve_fpmr_context(fpmr_ctx); + } + /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = From 4035c22ef7d43a6c00d6a6584c60e902b95b46af Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:50 +0000 Subject: [PATCH 126/134] arm64/ptrace: Expose FPMR via ptrace Add a new regset to expose FPMR via ptrace. It is not added to the FPSIMD registers since that structure is exposed elsewhere without any allowance for extension we don't add there. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-5-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 43 insertions(+) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index dc6cf0e37194..aacb45bd36e6 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -698,6 +698,39 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, return ret; } +static int fpmr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!system_supports_fpmr()) + return -EINVAL; + + if (target == current) + fpsimd_preserve_current_state(); + + return membuf_store(&to, target->thread.uw.fpmr); +} + +static int fpmr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + unsigned long fpmr; + + if (!system_supports_fpmr()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); + if (ret) + return ret; + + target->thread.uw.fpmr = fpmr; + + fpsimd_flush_task_state(target); + + return 0; +} + static int system_call_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) @@ -1419,6 +1452,7 @@ enum aarch64_regset { REGSET_HW_BREAK, REGSET_HW_WATCH, #endif + REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, @@ -1497,6 +1531,14 @@ static const struct user_regset aarch64_regsets[] = { .regset_get = system_call_get, .set = system_call_set, }, + [REGSET_FPMR] = { + .core_note_type = NT_ARM_FPMR, + .n = 1, + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = fpmr_get, + .set = fpmr_set, + }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ .core_note_type = NT_ARM_SVE, diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 9417309b7230..b54b313bcf07 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -440,6 +440,7 @@ typedef struct elf64_shdr { #define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ From c1932cac7902a8b0f7355515917dedc5412eb15d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:51 +0000 Subject: [PATCH 127/134] arm64/hwcap: Define hwcaps for 2023 DPISA features The 2023 architecture extensions include a large number of floating point features, most of which simply add new instructions. Add hwcaps so that userspace can enumerate these features. 
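As an illustration (not part of the patch), user space can then enumerate the new features with getauxval(3), using the HWCAP2_* constants from the uapi header updated below; a minimal sketch:

#include <stdio.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	/* HWCAP2_FPMR and HWCAP2_FAMINMAX are among the bits added here */
	printf("fpmr: %s\n", (hwcap2 & HWCAP2_FPMR) ? "yes" : "no");
	printf("faminmax: %s\n", (hwcap2 & HWCAP2_FAMINMAX) ? "yes" : "no");

	return 0;
}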
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-6-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 46 ++++++++++++++++++++++++ arch/arm64/include/asm/hwcap.h | 15 ++++++++ arch/arm64/include/uapi/asm/hwcap.h | 15 ++++++++ arch/arm64/kernel/cpufeature.c | 35 ++++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 15 ++++++++ 5 files changed, 126 insertions(+) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index ced7b335e2e0..448c1664879b 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -317,6 +317,52 @@ HWCAP2_LRCPC3 HWCAP2_LSE128 Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0011. +HWCAP2_FPMR + Functionality implied by ID_AA64PFR2_EL1.FPMR == 0b0001. + +HWCAP2_LUT + Functionality implied by ID_AA64ISAR2_EL1.LUT == 0b0001. + +HWCAP2_FAMINMAX + Functionality implied by ID_AA64ISAR3_EL1.FAMINMAX == 0b0001. + +HWCAP2_F8CVT + Functionality implied by ID_AA64FPFR0_EL1.F8CVT == 0b1. + +HWCAP2_F8FMA + Functionality implied by ID_AA64FPFR0_EL1.F8FMA == 0b1. + +HWCAP2_F8DP4 + Functionality implied by ID_AA64FPFR0_EL1.F8DP4 == 0b1. + +HWCAP2_F8DP2 + Functionality implied by ID_AA64FPFR0_EL1.F8DP2 == 0b1. + +HWCAP2_F8E4M3 + Functionality implied by ID_AA64FPFR0_EL1.F8E4M3 == 0b1. + +HWCAP2_F8E5M2 + Functionality implied by ID_AA64FPFR0_EL1.F8E5M2 == 0b1. + +HWCAP2_SME_LUTV2 + Functionality implied by ID_AA64SMFR0_EL1.LUTv2 == 0b1. + +HWCAP2_SME_F8F16 + Functionality implied by ID_AA64SMFR0_EL1.F8F16 == 0b1. + +HWCAP2_SME_F8F32 + Functionality implied by ID_AA64SMFR0_EL1.F8F32 == 0b1. + +HWCAP2_SME_SF8FMA + Functionality implied by ID_AA64SMFR0_EL1.SF8FMA == 0b1. + +HWCAP2_SME_SF8DP4 + Functionality implied by ID_AA64SMFR0_EL1.SF8DP4 == 0b1. + +HWCAP2_SME_SF8DP2 + Functionality implied by ID_AA64SMFR0_EL1.SF8DP2 == 0b1. + + 4.
Unused AT_HWCAP bits ----------------------- diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index cd71e09ea14d..4edd3b61df11 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -142,6 +142,21 @@ #define KERNEL_HWCAP_SVE_B16B16 __khwcap2_feature(SVE_B16B16) #define KERNEL_HWCAP_LRCPC3 __khwcap2_feature(LRCPC3) #define KERNEL_HWCAP_LSE128 __khwcap2_feature(LSE128) +#define KERNEL_HWCAP_FPMR __khwcap2_feature(FPMR) +#define KERNEL_HWCAP_LUT __khwcap2_feature(LUT) +#define KERNEL_HWCAP_FAMINMAX __khwcap2_feature(FAMINMAX) +#define KERNEL_HWCAP_F8CVT __khwcap2_feature(F8CVT) +#define KERNEL_HWCAP_F8FMA __khwcap2_feature(F8FMA) +#define KERNEL_HWCAP_F8DP4 __khwcap2_feature(F8DP4) +#define KERNEL_HWCAP_F8DP2 __khwcap2_feature(F8DP2) +#define KERNEL_HWCAP_F8E4M3 __khwcap2_feature(F8E4M3) +#define KERNEL_HWCAP_F8E5M2 __khwcap2_feature(F8E5M2) +#define KERNEL_HWCAP_SME_LUTV2 __khwcap2_feature(SME_LUTV2) +#define KERNEL_HWCAP_SME_F8F16 __khwcap2_feature(SME_F8F16) +#define KERNEL_HWCAP_SME_F8F32 __khwcap2_feature(SME_F8F32) +#define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA) +#define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4) +#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 5023599fa278..285610e626f5 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -107,5 +107,20 @@ #define HWCAP2_SVE_B16B16 (1UL << 45) #define HWCAP2_LRCPC3 (1UL << 46) #define HWCAP2_LSE128 (1UL << 47) +#define HWCAP2_FPMR (1UL << 48) +#define HWCAP2_LUT (1UL << 49) +#define HWCAP2_FAMINMAX (1UL << 50) +#define HWCAP2_F8CVT (1UL << 51) +#define HWCAP2_F8FMA (1UL << 52) +#define HWCAP2_F8DP4 (1UL << 53) +#define HWCAP2_F8DP2 (1UL << 54) +#define HWCAP2_F8E4M3 (1UL << 55) +#define HWCAP2_F8E5M2 (1UL << 56) +#define HWCAP2_SME_LUTV2 (1UL << 57) +#define HWCAP2_SME_F8F16 (1UL << 58) +#define HWCAP2_SME_F8F32 (1UL << 59) +#define HWCAP2_SME_SF8FMA (1UL << 60) +#define HWCAP2_SME_SF8DP4 (1UL << 61) +#define HWCAP2_SME_SF8DP2 (1UL << 62) #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0263565f617a..aefda789f510 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -220,6 +220,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0), @@ -235,6 +236,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar3[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -303,6 +305,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + 
FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), @@ -315,6 +319,10 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16B16_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), @@ -325,10 +333,22 @@ static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = { FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), + FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0), ARM64_FTR_END, }; @@ -2859,6 +2879,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP), HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT), @@ -2872,6 +2893,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16), HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH), HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM), + HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT), + HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX), HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE), @@ -2912,6 +2935,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { #ifdef CONFIG_ARM64_SME HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, 
CAP_HWCAP, KERNEL_HWCAP_SME), HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64), + HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1), HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2), HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64), @@ -2919,12 +2943,23 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16), + HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16), + HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32), HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32), HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32), HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), + HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), #endif /* CONFIG_ARM64_SME */ + HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 12b192060156..f0abb150f73e 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -128,6 +128,21 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SVE_B16B16] = "sveb16b16", [KERNEL_HWCAP_LRCPC3] = "lrcpc3", [KERNEL_HWCAP_LSE128] = "lse128", + [KERNEL_HWCAP_FPMR] = "fpmr", + [KERNEL_HWCAP_LUT] = "lut", + [KERNEL_HWCAP_FAMINMAX] = "faminmax", + [KERNEL_HWCAP_F8CVT] = "f8cvt", + [KERNEL_HWCAP_F8FMA] = "f8fma", + [KERNEL_HWCAP_F8DP4] = "f8dp4", + [KERNEL_HWCAP_F8DP2] = "f8dp2", + [KERNEL_HWCAP_F8E4M3] = "f8e4m3", + [KERNEL_HWCAP_F8E5M2] = "f8e5m2", + [KERNEL_HWCAP_SME_LUTV2] = "smelutv2", + [KERNEL_HWCAP_SME_F8F16] = "smef8f16", + [KERNEL_HWCAP_SME_F8F32] = "smef8f32", + [KERNEL_HWCAP_SME_SF8FMA] = "smesf8fma", + [KERNEL_HWCAP_SME_SF8DP4] = "smesf8dp4", + [KERNEL_HWCAP_SME_SF8DP2] = "smesf8dp2", }; #ifdef CONFIG_COMPAT From f4dcccdda5867bb68f48046e86e5a7ccaae10d27 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:52 +0000 Subject: [PATCH 128/134] kselftest/arm64: Handle FPMR context in generic signal frame parser Teach the generic signal frame parsing code about the newly added FPMR frame, avoiding warnings every time one is generated. 
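Note that signal frames are not the only way to observe this state from user space: the NT_ARM_FPMR regset added earlier in the series lets a debugger read the register from a stopped tracee. A minimal sketch (illustration only), assuming the tracee is already ptrace-stopped:

#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/elf.h>

static long read_fpmr(pid_t pid, uint64_t *fpmr)
{
	struct iovec iov = {
		.iov_base = fpmr,
		.iov_len = sizeof(*fpmr),
	};

	/* fails with EINVAL on kernels or CPUs without FPMR support */
	return ptrace(PTRACE_GETREGSET, pid, NT_ARM_FPMR, &iov);
}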
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-7-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/signal/testcases/testcases.c | 8 ++++++++ .../testing/selftests/arm64/signal/testcases/testcases.h | 1 + 2 files changed, 9 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index 9f580b55b388..674b88cc8c39 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -209,6 +209,14 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) zt = (struct zt_context *)head; new_flags |= ZT_CTX; break; + case FPMR_MAGIC: + if (flags & FPMR_CTX) + *err = "Multiple FPMR_MAGIC"; + else if (head->size != + sizeof(struct fpmr_context)) + *err = "Bad size for fpmr_context"; + new_flags |= FPMR_CTX; + break; case EXTRA_MAGIC: if (flags & EXTRA_CTX) *err = "Multiple EXTRA_MAGIC"; diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index a08ab0d6207a..7727126347e0 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -19,6 +19,7 @@ #define ZA_CTX (1 << 2) #define EXTRA_CTX (1 << 3) #define ZT_CTX (1 << 4) +#define FPMR_CTX (1 << 5) #define KSFT_BAD_MAGIC 0xdeadbeef From 7bcebadda045bead18f6f7050af59af388c4507c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:53 +0000 Subject: [PATCH 129/134] kselftest/arm64: Add basic FPMR test Verify that a FPMR frame is generated on systems that support FPMR and not generated otherwise. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-8-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/signal/.gitignore | 1 + .../arm64/signal/testcases/fpmr_siginfo.c | 82 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index 839e3a252629..1ce5b5eac386 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only mangle_* fake_sigreturn_* +fpmr_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c new file mode 100644 index 000000000000..e9d24685e741 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/fpmr_siginfo.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + * + * Verify that the FPMR register context in signal frames is set up as + * expected. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 128]; +} context; + +#define SYS_FPMR "S3_3_C4_C4_2" + +static uint64_t get_fpmr(void) +{ + uint64_t val; + + asm volatile ( + "mrs %0, " SYS_FPMR "\n" + : "=r"(val) + : + : "cc"); + + return val; +} + +int fpmr_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct fpmr_context *fpmr_ctx; + size_t offset; + bool in_sigframe; + bool have_fpmr; + __u64 orig_fpmr; + + have_fpmr = getauxval(AT_HWCAP2) & HWCAP2_FPMR; + if (have_fpmr) + orig_fpmr = get_fpmr(); + + if (!get_current_context(td, &context.uc, sizeof(context))) + return 1; + + fpmr_ctx = (struct fpmr_context *) + get_header(head, FPMR_MAGIC, td->live_sz, &offset); + + in_sigframe = fpmr_ctx != NULL; + + fprintf(stderr, "FPMR sigframe %s on system %s FPMR\n", + in_sigframe ? "present" : "absent", + have_fpmr ? "with" : "without"); + + td->pass = (in_sigframe == have_fpmr); + + if (have_fpmr && fpmr_ctx) { + if (fpmr_ctx->fpmr != orig_fpmr) { + fprintf(stderr, "FPMR in frame is %llx, was %llx\n", + fpmr_ctx->fpmr, orig_fpmr); + td->pass = false; + } + } + + return 0; +} + +struct tdescr tde = { + .name = "FPMR", + .descr = "Validate that FPMR is present as expected", + .timeout = 3, + .run = fpmr_present, +}; From 44d10c27bd75b88b50d0c5cf2c3fe92933c39f1e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Mar 2024 23:14:54 +0000 Subject: [PATCH 130/134] kselftest/arm64: Add 2023 DPISA hwcap test coverage Add the hwcaps added for the 2023 DPISA extensions to the hwcaps test program. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240306-arm64-2023-dpisa-v5-9-c568edc8ed7f@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 217 ++++++++++++++++++++++ 1 file changed, 217 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 1189e77c8152..d8909b2b535a 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -58,11 +58,46 @@ static void cssc_sigill(void) asm volatile(".inst 0xdac01c00" : : : "x0"); } +static void f8cvt_sigill(void) +{ + /* FSCALE V0.4H, V0.4H, V0.4H */ + asm volatile(".inst 0x2ec03c00"); +} + +static void f8dp2_sigill(void) +{ + /* FDOT V0.4H, V0.4H, V0.5H */ + asm volatile(".inst 0xe40fc00"); +} + +static void f8dp4_sigill(void) +{ + /* FDOT V0.2S, V0.2S, V0.2S */ + asm volatile(".inst 0xe00fc00"); +} + +static void f8fma_sigill(void) +{ + /* FMLALB V0.8H, V0.16B, V0.16B */ + asm volatile(".inst 0xec0fc00"); +} + +static void faminmax_sigill(void) +{ + /* FAMIN V0.4H, V0.4H, V0.4H */ + asm volatile(".inst 0x2ec01c00"); +} + static void fp_sigill(void) { asm volatile("fmov s0, #1"); } +static void fpmr_sigill(void) +{ + asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0"); +} + static void ilrcpc_sigill(void) { /* LDAPUR W0, [SP, #8] */ @@ -95,6 +130,12 @@ static void lse128_sigill(void) : "cc", "memory"); } +static void lut_sigill(void) +{ + /* LUTI2 V0.16B, { V0.16B }, V[0] */ + asm volatile(".inst 0x4e801000"); +} + static void mops_sigill(void) { char dst[1], src[1]; @@ -216,6 +257,78 @@ static void smef16f16_sigill(void) asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); } +static void smef8f16_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* FDOT ZA.H[W0, 0], Z0.B-Z1.B, Z0.B-Z1.B 
*/ + asm volatile(".inst 0xc1a01020" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + +static void smef8f32_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* FDOT ZA.S[W0, 0], { Z0.B-Z1.B }, Z0.B[0] */ + asm volatile(".inst 0xc1500038" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + +static void smelutv2_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* LUTI4 { Z0.B-Z3.B }, ZT0, { Z0-Z1 } */ + asm volatile(".inst 0xc08b0000" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + +static void smesf8dp2_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* FDOT Z0.H, Z0.B, Z0.B[0] */ + asm volatile(".inst 0x64204400" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + +static void smesf8dp4_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* FDOT Z0.S, Z0.B, Z0.B[0] */ + asm volatile(".inst 0xc1a41C00" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + +static void smesf8fma_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + /* FMLALB V0.8H, V0.16B, V0.16B */ + asm volatile(".inst 0xec0fc00"); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + static void sve_sigill(void) { /* RDVL x0, #0 */ @@ -353,6 +466,53 @@ static const struct hwcap_data { .cpuinfo = "cssc", .sigill_fn = cssc_sigill, }, + { + .name = "F8CVT", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8CVT, + .cpuinfo = "f8cvt", + .sigill_fn = f8cvt_sigill, + }, + { + .name = "F8DP4", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8DP4, + .cpuinfo = "f8dp4", + .sigill_fn = f8dp4_sigill, + }, + { + .name = "F8DP2", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8DP2, + .cpuinfo = "f8dp2", + .sigill_fn = f8dp2_sigill, + }, + { + .name = "F8E5M2", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8E5M2, + .cpuinfo = "f8e5m2", + }, + { + .name = "F8E4M3", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8E4M3, + .cpuinfo = "f8e4m3", + }, + { + .name = "F8FMA", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_F8FMA, + .cpuinfo = "f8fma", + .sigill_fn = f8fma_sigill, + }, + { + .name = "FAMINMAX", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_FAMINMAX, + .cpuinfo = "faminmax", + .sigill_fn = faminmax_sigill, + }, { .name = "FP", .at_hwcap = AT_HWCAP, @@ -360,6 +520,14 @@ static const struct hwcap_data { .cpuinfo = "fp", .sigill_fn = fp_sigill, }, + { + .name = "FPMR", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_FPMR, + .cpuinfo = "fpmr", + .sigill_fn = fpmr_sigill, + .sigill_reliable = true, + }, { .name = "JSCVT", .at_hwcap = AT_HWCAP, @@ -411,6 +579,13 @@ static const struct hwcap_data { .cpuinfo = "lse128", .sigill_fn = lse128_sigill, }, + { + .name = "LUT", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_LUT, + .cpuinfo = "lut", + .sigill_fn = lut_sigill, + }, { .name = "MOPS", + .at_hwcap = AT_HWCAP2, @@ -511,6 +686,48 @@ static const struct hwcap_data { .cpuinfo = "smef16f16", .sigill_fn = smef16f16_sigill, }, + { + .name = "SME F8F16", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_F8F16, + .cpuinfo = "smef8f16", + .sigill_fn = smef8f16_sigill, + }, + { + .name = "SME F8F32", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_F8F32, + .cpuinfo = "smef8f32", + .sigill_fn = smef8f32_sigill, + }, + { + .name = "SME LUTV2", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_LUTV2, + .cpuinfo = "smelutv2",
+ .sigill_fn = smelutv2_sigill, + }, + { + .name = "SME SF8FMA", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_SF8FMA, + .cpuinfo = "smesf8fma", + .sigill_fn = smesf8fma_sigill, + }, + { + .name = "SME SF8DP2", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_SF8DP2, + .cpuinfo = "smesf8dp2", + .sigill_fn = smesf8dp2_sigill, + }, + { + .name = "SME SF8DP4", + .at_hwcap = AT_HWCAP2, + .hwcap_bit = HWCAP2_SME_SF8DP4, + .cpuinfo = "smesf8dp4", + .sigill_fn = smesf8dp4_sigill, + }, { .name = "SVE", .at_hwcap = AT_HWCAP, From 0499a78369adacec1af29340b71ff8dd375b4697 Mon Sep 17 00:00:00 2001 From: "Christoph Lameter (Ampere)" Date: Wed, 6 Mar 2024 17:45:04 -0800 Subject: [PATCH 131/134] ARM64: Dynamically allocate cpumasks and increase supported CPUs to 512 Currently defconfig selects NR_CPUS=256, but some vendors (e.g. Ampere Computing) are planning to ship systems with 512 CPUs. So that all CPUs on these systems can be used with defconfig, we'd like to bump NR_CPUS to 512. Therefore this patch increases the default NR_CPUS from 256 to 512. As increasing NR_CPUS will increase the size of cpumasks, there's a fear that this might have a significant impact on stack usage due to code which places cpumasks on the stack. To mitigate that concern, we can select CPUMASK_OFFSTACK. As that doesn't seem to be a problem today with NR_CPUS=256, we only select this when NR_CPUS > 256. CPUMASK_OFFSTACK configures the cpumasks in the kernel to be dynamically allocated. This was used in the X86 architecture in the past to enable support for larger CPU configurations up to 8k cpus. With that it becomes possible to dynamically size the allocation of the cpu bitmaps depending on the quantity of processors detected on bootup. Memory used for cpumasks will increase if the kernel is run on a machine with more cores. Further increases may be needed if ARM processor vendors start supporting more processors. Given the current inflationary trends in core counts from multiple processor manufacturers this may occur. There are minor regressions for hackbench. The kernel data size for 512 cpus is smaller with offstack than with onstack. Benchmark results using hackbench average over 10 runs of hackbench -s 512 -l 2000 -g 15 -f 25 -P on Altra 80 Core Support for 256 CPUs on stack. Baseline 7.8564 sec Support for 512 CPUs on stack.
7.8713 sec + 0.18% 512 CPUs offstack 7.8916 sec + 0.44% Kernel size comparison: text data filename Difference to onstack256 baseline 25755648 9589248 vmlinuz-6.8.0-rc4-onstack256 25755648 9607680 vmlinuz-6.8.0-rc4-onstack512 +0.19% 25755648 9603584 vmlinuz-6.8.0-rc4-offstack512 +0.14% Tested-by: Eric Mackay Reviewed-by: Russell King (Oracle) Signed-off-by: Christoph Lameter (Ampere) Acked-by: Mark Rutland Link: https://lore.kernel.org/r/37099a57-b655-3b3a-56d0-5f7fbd49d7db@gentwo.org [catalin.marinas@arm.com: use 'select' instead of duplicating 'config CPUMASK_OFFSTACK'] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b73d702e1e30..cdb307584d31 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -120,6 +120,7 @@ config ARM64 select CLONE_BACKWARDS select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) + select CPUMASK_OFFSTACK if NR_CPUS > 256 select CRC32 select DCACHE_WORD_ACCESS select DYNAMIC_FTRACE if FUNCTION_TRACER @@ -1424,7 +1425,7 @@ config SCHED_SMT config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 - default "256" + default "512" config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" From f1bbc4e9cfa4c0f29883171e9d01c01cbe94becc Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 11 Mar 2024 18:40:49 +0000 Subject: [PATCH 132/134] Revert "ARM64: Dynamically allocate cpumasks and increase supported CPUs to 512" This reverts commit 0499a78369adacec1af29340b71ff8dd375b4697. Enabling CPUMASK_OFFSTACK on arm64 triggers a warning in the dev_pm_opp_set_config() function followed by a failure to set the regulators and a cpufreq-dt probing error. There is no apparent reason why this happens, so revert this commit until further investigation. Signed-off-by: Catalin Marinas Reported-by: Marek Szyprowski Link: https://lore.kernel.org/r/c1f2902d-cefc-4122-9b86-d1d32911f590@samsung.com --- arch/arm64/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6daceefcedfc..24dfd87fab93 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -120,7 +120,6 @@ config ARM64 select CLONE_BACKWARDS select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) - select CPUMASK_OFFSTACK if NR_CPUS > 256 select CRC32 select DCACHE_WORD_ACCESS select DYNAMIC_FTRACE if FUNCTION_TRACER @@ -1431,7 +1430,7 @@ config SCHED_SMT config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 - default "512" + default "256" config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" From 69ebc0182406541f0be0f086cdfff13ac56e7385 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 12 Mar 2024 20:00:20 +0000 Subject: [PATCH 133/134] Revert "arm64: mm: add support for WXN memory translation attribute" This reverts commit 50e3ed0f93f4f62ed2aa83de5db6cb84ecdd5707. The SCTLR_EL1.WXN control forces execute-never when a page has write permissions. While the idea of hardening such write/exec combinations is good, with permissions indirection enabled (FEAT_PIE) this control becomes RES0. FEAT_PIE introduces a slightly different form of WXN which only has an effect when the base permission is RWX and the write is toggled by the permission overlay (FEAT_POE, not yet supported by the arm64 kernel). Revert the patch for now.
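For reference, the user-visible effect being reverted was that, with CONFIG_ARM64_WXN=y and arm64.nowxn not set, attempts to create writable+executable mappings were refused. A small probe illustrating this (sketch only, not from the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)	/* -EACCES from the WXN prot check */
		printf("W+X mapping rejected: %s\n", strerror(errno));
	else
		printf("W+X mapping allowed at %p\n", p);

	return 0;
}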
Signed-off-by: Catalin Marinas Link: https://lore.kernel.org/r/ZfGESD3a91lxH367@arm.com --- arch/arm64/Kconfig | 11 -------- arch/arm64/include/asm/cpufeature.h | 8 ------ arch/arm64/include/asm/mman.h | 36 --------------------------- arch/arm64/include/asm/mmu_context.h | 30 +--------------------- arch/arm64/kernel/pi/idreg-override.c | 4 +-- arch/arm64/kernel/pi/map_kernel.c | 23 ----------------- arch/arm64/mm/proc.S | 6 ----- 7 files changed, 2 insertions(+), 116 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 24dfd87fab93..4869265ace2d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1606,17 +1606,6 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases. -config ARM64_WXN - bool "Enable WXN attribute so all writable mappings are non-exec" - help - Set the WXN bit in the SCTLR system register so that all writable - mappings are treated as if the PXN/UXN bit is set as well. - If this is set to Y, it can still be disabled at runtime by - passing 'arm64.nowxn' on the kernel command line. - - This should only be set if no software needs to be supported that - relies on being able to execute from writable mappings. - config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" help diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 66ba0801f7b7..6d86ad37c615 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -18,7 +18,6 @@ #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 #define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF 8 -#define ARM64_SW_FEATURE_OVERRIDE_NOWXN 12 #ifndef __ASSEMBLY__ @@ -968,13 +967,6 @@ static inline bool kaslr_disabled_cmdline(void) return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR); } -static inline bool arm64_wxn_enabled(void) -{ - if (!IS_ENABLED(CONFIG_ARM64_WXN)) - return false; - return !arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN); -} - u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 6d4940342ba7..5966ee4a6154 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -35,40 +35,11 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) } #define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags) -static inline bool arm64_check_wx_prot(unsigned long prot, - struct task_struct *tsk) -{ - /* - * When we are running with SCTLR_ELx.WXN==1, writable mappings are - * implicitly non-executable. This means we should reject such mappings - * when user space attempts to create them using mmap() or mprotect(). - */ - if (arm64_wxn_enabled() && - ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))) { - /* - * User space libraries such as libffi carry elaborate - * heuristics to decide whether it is worth it to even attempt - * to create writable executable mappings, as PaX or selinux - * enabled systems will outright reject it. They will usually - * fall back to something else (e.g., two separate shared - * mmap()s of a temporary file) on failure. 
- */ - pr_info_ratelimited( - "process %s (%d) attempted to create PROT_WRITE+PROT_EXEC mapping\n", - tsk->comm, tsk->pid); - return false; - } - return true; -} - static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) { unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM; - if (!arm64_check_wx_prot(prot, current)) - return false; - if (system_supports_bti()) supported |= PROT_BTI; @@ -79,13 +50,6 @@ static inline bool arch_validate_prot(unsigned long prot, } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) -static inline bool arch_validate_mmap_prot(unsigned long prot, - unsigned long addr) -{ - return arm64_check_wx_prot(prot, current); -} -#define arch_validate_mmap_prot arch_validate_mmap_prot - static inline bool arch_validate_flags(unsigned long vm_flags) { if (!system_supports_mte()) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index f0fe2d09d139..c768d16b81a4 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,41 +20,13 @@ #include #include #include +#include #include #include #include extern bool rodata_full; -static inline int arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - return 0; -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - if (IS_ENABLED(CONFIG_ARM64_WXN) && execute && - (vma->vm_flags & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC)) { - pr_warn_ratelimited( - "process %s (%d) attempted to execute from writable memory\n", - current->comm, current->pid); - /* disallow unless the nowxn override is set */ - return !arm64_wxn_enabled(); - } - return true; -} - static inline void contextidr_thread_switch(struct task_struct *next) { if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index bccfee34f62f..aad399796e81 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -189,7 +189,6 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), FIELD("rodataoff", ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF, NULL), - FIELD("nowxn", ARM64_SW_FEATURE_OVERRIDE_NOWXN, NULL), {} }, }; @@ -222,9 +221,8 @@ static const struct { { "arm64.nomops", "id_aa64isar2.mops=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, { "nokaslr", "arm64_sw.nokaslr=1" }, - { "rodata=off", "arm64_sw.rodataoff=1 arm64_sw.nowxn=1" }, + { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, - { "arm64.nowxn", "arm64_sw.nowxn=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index cac1e1f63c44..5fa08e13e17e 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -132,25 +132,6 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) idmap_cpu_replace_ttbr1(swapper_pg_dir); } -static void noinline __section(".idmap.text") disable_wxn(void) -{ - u64 sctlr = read_sysreg(sctlr_el1) & ~SCTLR_ELx_WXN; - - /* - * We cannot safely clear the WXN bit while the MMU and 
caches are on, - * so turn the MMU off, flush the TLBs and turn it on again but with - * the WXN bit cleared this time. - */ - asm(" msr sctlr_el1, %0 ;" - " isb ;" - " tlbi vmalle1 ;" - " dsb nsh ;" - " isb ;" - " msr sctlr_el1, %1 ;" - " isb ;" - :: "r"(sctlr & ~SCTLR_ELx_M), "r"(sctlr)); -} - static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) { u64 sctlr = read_sysreg(sctlr_el1); @@ -248,10 +229,6 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) if (va_bits > VA_BITS_MIN) sysreg_clear_set(tcr_el1, TCR_T1SZ_MASK, TCR_T1SZ(va_bits)); - if (IS_ENABLED(CONFIG_ARM64_WXN) && - arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOWXN)) - disable_wxn(); - /* * The virtual KASLR displacement modulo 2MiB is decided by the * physical placement of the image, as otherwise, we might not be able diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index bfd2ad896108..9d40f3ffd8d2 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -546,12 +546,6 @@ alternative_else_nop_endif * Prepare SCTLR */ mov_q x0, INIT_SCTLR_EL1_MMU_ON -#ifdef CONFIG_ARM64_WXN - ldr_l x1, arm64_sw_feature_override + FTR_OVR_VAL_OFFSET - tst x1, #0xf << ARM64_SW_FEATURE_OVERRIDE_NOWXN - orr x1, x0, #SCTLR_ELx_WXN - csel x0, x0, x1, ne -#endif ret // return to head.S .unreq mair From 1ef21fcd6a50f011680dbbd678c1bea8e3f67ab9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 12 Mar 2024 20:00:38 +0000 Subject: [PATCH 134/134] Revert "mm: add arch hook to validate mmap() prot flags" This reverts commit cb1a393c40eee2f1692c995ea0cc6e45bfccde4d. Since the arm64 WXN patch has been reverted, remove this hook as it would not have any users. Signed-off-by: Catalin Marinas Link: https://lore.kernel.org/r/ZfGESD3a91lxH367@arm.com --- include/linux/mman.h | 15 --------------- mm/mmap.c | 3 --- 2 files changed, 18 deletions(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index ec5e7f606e43..dc7048824be8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -124,21 +124,6 @@ static inline bool arch_validate_flags(unsigned long flags) #define arch_validate_flags arch_validate_flags #endif -#ifndef arch_validate_mmap_prot -/* - * This is called from mmap(), which ignores unknown prot bits so the default - * is to accept anything. - * - * Returns true if the prot flags are valid - */ -static inline bool arch_validate_mmap_prot(unsigned long prot, - unsigned long addr) -{ - return true; -} -#define arch_validate_mmap_prot arch_validate_mmap_prot -#endif - /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 diff --git a/mm/mmap.c b/mm/mmap.c index 977a8c3fd9f5..d89770eaab6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1229,9 +1229,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; - if (!arch_validate_mmap_prot(prot, addr)) - return -EACCES; - /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED;
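As a closing note, the hook removed above followed the common optional-per-arch-hook idiom: the generic header supplies a fallback definition unless the architecture has already defined the macro. A generic sketch of the idiom (the name is illustrative, not a real kernel hook):

/* generic header: default definition, overridable by the architecture */
#ifndef arch_validate_example
static inline bool arch_validate_example(unsigned long prot,
					 unsigned long addr)
{
	return true;	/* default: accept anything */
}
#define arch_validate_example arch_validate_example
#endif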