x86 MM handling code changes for v6.7:

- Add new NX-stack self-test
  - Improve NUMA partial-CFMWS handling
  - Fix #VC handler bugs resulting in SEV-SNP boot failures
  - Drop the 4MB memory size restriction on minimal NUMA nodes
  - Reorganize headers a bit, in preparation to header dependency reduction efforts
  - Misc cleanups & fixes
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmU9Ek4RHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1gIJQ/+Mg6mzMaThyNXqhJszeZJBmDaBv2sqjAB
 5tcferg1nJBdNBzX8bJ95UFt9fIqeYAcgH00qlQCYSmyzbC1TQTk9U2Pre1zbOw4
 042ONK8sygKSje1zdYleHoBeqwnxD2VNM0NwBElhGjumwHRng/tbLiI9wx6qiz+C
 VsFXavkBszHGA1pjy9wZLGixYIH5jCygMpH134Wp+CIhpS+C4nftcGdIL1D5Oil1
 6Tm2XeI6uyfiQhm9IOwDjfoYeC7gUjx1rp8rHseGUMJxyO/BX9q5j1ixbsVriqfW
 97ucYuRL9mza7ic516C9v7OlAA3AGH2xWV+SYOGK88i9Co4kYzP4WnamxXqOsD8+
 popxG55oa6QelhaouTBZvgERpZ4fWupSDs/UccsDaE9leMCerNEbGHEzt/Mm/2sw
 xopjMQ0y5Kn6/fS0dLv8U+XHu4ANkvXJkFd6Ny0h/WfgGefuQOOTG9ruYgfeqqB8
 dViQ4R7CO8ySjD45KawAZl/EqL86x1M/CI1nlt0YY4vNwUuOJbebL7Jn8w3Fjxm5
 FVfUlDmcPdhZfL9Vnrsi6MIou1cU1yJPw4D6sXJ4sg4s7A4ebBcRRrjayVQ4msjv
 Q7cvBOMnWEHhOV11pvP50FmQuj74XW3bUqiuWrnK1SypvnhHavF6kc1XYpBLs1xZ
 y8nueJW2qPw=
 =tT5F
 -----END PGP SIGNATURE-----

Merge tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm handling updates from Ingo Molnar:

 - Add new NX-stack self-test

 - Improve NUMA partial-CFMWS handling

 - Fix #VC handler bugs resulting in SEV-SNP boot failures

 - Drop the 4MB memory size restriction on minimal NUMA nodes

 - Reorganize headers a bit, in preparation to header dependency
   reduction efforts

 - Misc cleanups & fixes

* tag 'x86-mm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Drop the 4 MB restriction on minimal NUMA node memory size
  selftests/x86/lam: Zero out buffer for readlink()
  x86/sev: Drop unneeded #include
  x86/sev: Move sev_setup_arch() to mem_encrypt.c
  x86/tdx: Replace deprecated strncpy() with strtomem_pad()
  selftests/x86/mm: Add new test that userspace stack is in fact NX
  x86/sev: Make boot_ghcb_page[] static
  x86/boot: Move x86_cache_alignment initialization to correct spot
  x86/sev-es: Set x86_virt_bits to the correct value straight away, instead of a two-phase approach
  x86/sev-es: Allow copy_from_kernel_nofault() in earlier boot
  x86_64: Show CR4.PSE on auxiliaries like on BSP
  x86/iommu/docs: Update AMD IOMMU specification document URL
  x86/sev/docs: Update document URL in amd-memory-encryption.rst
  x86/mm: Move arch_memory_failure() and arch_is_platform_page() definitions from <asm/processor.h> to <asm/pgtable.h>
  ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window
  x86/numa: Introduce numa_fill_memblks()
This commit is contained in:
Linus Torvalds 2023-10-30 15:40:57 -10:00
commit f0d25b5d0f
21 changed files with 404 additions and 95 deletions

View File

@ -130,4 +130,4 @@ SNP feature support.
More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR
[1] https://www.amd.com/system/files/TechDocs/40332.pdf
[1] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/24593.pdf

View File

@ -5,7 +5,7 @@ x86 IOMMU Support
The architecture specs can be obtained from the below locations.
- Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf
- AMD: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_3_07_PUB.pdf
This guide gives a quick cheat sheet for some basic understanding.

View File

@ -25,7 +25,7 @@
#include "error.h"
#include "../msr.h"
struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
/*

View File

@ -119,7 +119,7 @@ static void __noreturn tdx_panic(const char *msg)
} message;
/* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
strncpy(message.str, msg, 64);
strtomem_pad(message.str, msg, '\0');
args.r8 = message.r8;
args.r9 = message.r9;

View File

@ -19,8 +19,10 @@
#ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void);
void __init mem_encrypt_setup_arch(void);
#else
static inline void mem_encrypt_init(void) { }
static inline void __init mem_encrypt_setup_arch(void) { }
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
@ -43,7 +45,6 @@ void __init sme_map_bootdata(char *real_mode_data);
void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
void __init sev_setup_arch(void);
void __init sme_encrypt_kernel(struct boot_params *bp);
void __init sme_enable(struct boot_params *bp);
@ -73,7 +74,6 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { }
static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
static inline void __init sev_setup_arch(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }

View File

@ -12,13 +12,6 @@
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
/*
* Too small node sizes may confuse the VM badly. Usually they
* result from BIOS bugs. So dont recognize nodes as standalone
* NUMA entities that have less than this amount of RAM listed:
*/
#define NODE_MIN_SIZE (4*1024*1024)
extern int numa_off;
/*

View File

@ -1716,6 +1716,14 @@ static inline bool pud_user_accessible_page(pud_t pud)
}
#endif
#ifdef CONFIG_X86_SGX
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure
bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_H */

View File

@ -724,14 +724,6 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
#ifdef CONFIG_X86_SGX
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure
bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif
extern bool gds_ucode_mitigated(void);
#endif /* _ASM_X86_PROCESSOR_H */

View File

@ -37,6 +37,8 @@ extern int phys_to_target_node(phys_addr_t start);
#define phys_to_target_node phys_to_target_node
extern int memory_add_physaddr_to_nid(u64 start);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
extern int numa_fill_memblks(u64 start, u64 end);
#define numa_fill_memblks numa_fill_memblks
#endif
#endif /* __ASSEMBLY__ */

View File

@ -1115,18 +1115,34 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
bool vp_bits_from_cpuid = true;
if (c->extended_cpuid_level >= 0x80000008) {
if (!cpu_has(c, X86_FEATURE_CPUID) ||
(c->extended_cpuid_level < 0x80000008))
vp_bits_from_cpuid = false;
if (vp_bits_from_cpuid) {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
} else {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
c->x86_virt_bits = 48;
} else {
c->x86_clflush_size = 32;
c->x86_virt_bits = 32;
c->x86_phys_bits = 32;
if (cpu_has(c, X86_FEATURE_PAE) ||
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
}
#ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
#endif
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@ -1580,17 +1596,6 @@ static void __init cpu_parse_early_param(void)
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
c->x86_virt_bits = 48;
#else
c->x86_clflush_size = 32;
c->x86_phys_bits = 32;
c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
@ -1602,7 +1607,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
cpu_parse_early_param();
@ -1618,6 +1622,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_CPUID);
}
get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c);

View File

@ -179,8 +179,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl $0, %ecx
#endif
/* Enable PAE mode, PGE and LA57 */
orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
/* Enable PAE mode, PSE, PGE and LA57 */
orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f

View File

@ -1120,7 +1120,7 @@ void __init setup_arch(char **cmdline_p)
* Needs to run after memblock setup because it needs the physical
* memory size.
*/
sev_setup_arch();
mem_encrypt_setup_arch();
efi_fake_memmap();
efi_find_mirror();

View File

@ -9,12 +9,21 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
unsigned long vaddr = (unsigned long)unsafe_src;
/*
* Range covering the highest possible canonical userspace address
* as well as non-canonical address range. For the canonical range
* we also need to include the userspace guard page.
* Do not allow userspace addresses. This disallows
* normal userspace and the userspace guard page:
*/
return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
__is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
return false;
/*
* Allow everything during early boot before 'x86_virt_bits'
* is initialized. Needed for instruction decoding in early
* exception handlers.
*/
if (!boot_cpu_data.x86_virt_bits)
return true;
return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits);
}
#else
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)

View File

@ -12,6 +12,7 @@
#include <linux/swiotlb.h>
#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
#include <linux/virtio_anchor.h>
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@ -86,3 +87,36 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
void __init mem_encrypt_setup_arch(void)
{
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
/*
* For SEV and TDX, all DMA has to occur via shared/unencrypted pages.
* Kernel uses SWIOTLB to make this happen without changing device
* drivers. However, depending on the workload being run, the
* default 64MB of SWIOTLB may not be enough and SWIOTLB may
* run out of buffers for DMA, resulting in I/O errors and/or
* performance degradation especially with high I/O workloads.
*
* Adjust the default size of SWIOTLB using a percentage of guest
* memory for SWIOTLB buffers. Also, as the SWIOTLB bounce buffer
* memory is allocated from low memory, ensure that the adjusted size
* is within the limits of low available memory.
*
* The percentage of guest memory used here for SWIOTLB buffers
* is more of an approximation of the static adjustment which
* 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
*/
size = total_mem * 6 / 100;
size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
swiotlb_adjust_size(size);
/* Set restricted memory access for virtio. */
virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
}

View File

@ -19,8 +19,6 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
#include <linux/virtio_anchor.h>
#include <linux/cc_platform.h>
#include <asm/tlbflush.h>
@ -215,40 +213,6 @@ void __init sme_map_bootdata(char *real_mode_data)
__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
}
void __init sev_setup_arch(void)
{
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
/*
* For SEV, all DMA has to occur via shared/unencrypted pages.
* SEV uses SWIOTLB to make this happen without changing device
* drivers. However, depending on the workload being run, the
* default 64MB of SWIOTLB may not be enough and SWIOTLB may
* run out of buffers for DMA, resulting in I/O errors and/or
* performance degradation especially with high I/O workloads.
*
* Adjust the default size of SWIOTLB for SEV guests using
* a percentage of guest memory for SWIOTLB buffers.
* Also, as the SWIOTLB bounce buffer memory is allocated
* from low memory, ensure that the adjusted size is within
* the limits of low available memory.
*
* The percentage of guest memory used here for SWIOTLB buffers
* is more of an approximation of the static adjustment which
* 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
*/
size = total_mem * 6 / 100;
size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
swiotlb_adjust_size(size);
/* Set restricted memory access for virtio. */
virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
}
static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
{
unsigned long pfn = 0;

View File

@ -12,6 +12,7 @@
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <asm/e820/api.h>
#include <asm/proto.h>
@ -602,13 +603,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
if (start >= end)
continue;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
continue;
alloc_node_data(nid);
}
@ -964,4 +958,83 @@ int memory_add_physaddr_to_nid(u64 start)
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
static int __init cmp_memblk(const void *a, const void *b)
{
const struct numa_memblk *ma = *(const struct numa_memblk **)a;
const struct numa_memblk *mb = *(const struct numa_memblk **)b;
return ma->start - mb->start;
}
static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
/**
* numa_fill_memblks - Fill gaps in numa_meminfo memblks
* @start: address to begin fill
* @end: address to end fill
*
* Find and extend numa_meminfo memblks to cover the @start-@end
* physical address range, such that the first memblk includes
* @start, the last memblk includes @end, and any gaps in between
* are filled.
*
* RETURNS:
* 0 : Success
* NUMA_NO_MEMBLK : No memblk exists in @start-@end range
*/
int __init numa_fill_memblks(u64 start, u64 end)
{
struct numa_memblk **blk = &numa_memblk_list[0];
struct numa_meminfo *mi = &numa_meminfo;
int count = 0;
u64 prev_end;
/*
* Create a list of pointers to numa_meminfo memblks that
* overlap start, end. Exclude (start == bi->end) since
* end addresses in both a CFMWS range and a memblk range
* are exclusive.
*
* This list of pointers is used to make in-place changes
* that fill out the numa_meminfo memblks.
*/
for (int i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
if (start < bi->end && end >= bi->start) {
blk[count] = &mi->blk[i];
count++;
}
}
if (!count)
return NUMA_NO_MEMBLK;
/* Sort the list of pointers in memblk->start order */
sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
/* Make sure the first/last memblks include start/end */
blk[0]->start = min(blk[0]->start, start);
blk[count - 1]->end = max(blk[count - 1]->end, end);
/*
* Fill any gaps by tracking the previous memblks
* end address and backfilling to it if needed.
*/
prev_end = blk[0]->end;
for (int i = 1; i < count; i++) {
struct numa_memblk *curr = blk[i];
if (prev_end >= curr->start) {
if (prev_end < curr->end)
prev_end = curr->end;
} else {
curr->start = prev_end;
prev_end = curr->end;
}
}
return 0;
}
#endif

View File

@ -310,11 +310,16 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
start = cfmws->base_hpa;
end = cfmws->base_hpa + cfmws->window_size;
/* Skip if the SRAT already described the NUMA details for this HPA */
node = phys_to_target_node(start);
if (node != NUMA_NO_NODE)
/*
* The SRAT may have already described NUMA details for all,
* or a portion of, this CFMWS HPA range. Extend the memblks
* found for any portion of the window to cover the entire
* window.
*/
if (!numa_fill_memblks(start, end))
return 0;
/* No SRAT description. Create a new node. */
node = acpi_map_pxm_to_node(*fake_pxm);
if (node == NUMA_NO_NODE) {

View File

@ -12,6 +12,7 @@
#define MAX_NUMNODES (1 << NODES_SHIFT)
#define NUMA_NO_NODE (-1)
#define NUMA_NO_MEMBLK (-1)
/* optionally keep NUMA memory info available post init */
#ifdef CONFIG_NUMA_KEEP_MEMINFO
@ -43,6 +44,12 @@ static inline int phys_to_target_node(u64 start)
return 0;
}
#endif
#ifndef numa_fill_memblks
static inline int __init numa_fill_memblks(u64 start, u64 end)
{
return NUMA_NO_MEMBLK;
}
#endif
#else /* !CONFIG_NUMA */
static inline int numa_nearest_node(int node, unsigned int state)
{

View File

@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
syscall_arg_fault fsgsbase_restore sigaltstack
TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack

View File

@ -573,7 +573,7 @@ int do_uring(unsigned long lam)
char path[PATH_MAX] = {0};
/* get current process path */
if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
return 1;
int file_fd = open(path, O_RDONLY);
@ -680,14 +680,14 @@ static int handle_execve(struct testcases *test)
perror("Fork failed.");
ret = 1;
} else if (pid == 0) {
char path[PATH_MAX];
char path[PATH_MAX] = {0};
/* Set LAM mode in parent process */
if (set_lam(lam) != 0)
return 1;
/* Get current binary's path and the binary was run by execve */
if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
if (readlink("/proc/self/exe", path, PATH_MAX - 1) <= 0)
exit(-1);
/* run binary to get LAM mode and return to parent process */

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
* because I don't want to bother with PT_GNU_STACK detection.
*
* Fill the stack with INT3's and then try to execute some of them:
* SIGSEGV -- good, SIGTRAP -- bad.
*
* Regular stack is completely overwritten before testing.
* Test doesn't exit SIGSEGV handler after first fault at INT3.
*/
#undef _GNU_SOURCE
#define _GNU_SOURCE
#undef NDEBUG
#include <assert.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>
#define PAGE_SIZE 4096
/*
* This is memset(rsp, 0xcc, -1); but down.
* It will SIGSEGV when bottom of the stack is reached.
* Byte-size access is important! (see rdi tweak in the signal handler).
*/
void make_stack1(void);
asm(
".pushsection .text\n"
".globl make_stack1\n"
".align 16\n"
"make_stack1:\n"
"mov $0xcc, %al\n"
#if defined __amd64__
"mov %rsp, %rdi\n"
"mov $-1, %rcx\n"
#elif defined __i386__
"mov %esp, %edi\n"
"mov $-1, %ecx\n"
#else
#error
#endif
"std\n"
"rep stosb\n"
/* unreachable */
"hlt\n"
".type make_stack1,@function\n"
".size make_stack1,.-make_stack1\n"
".popsection\n"
);
/*
* memset(p, 0xcc, -1);
* It will SIGSEGV when top of the stack is reached.
*/
void make_stack2(uint64_t p);
asm(
".pushsection .text\n"
".globl make_stack2\n"
".align 16\n"
"make_stack2:\n"
"mov $0xcc, %al\n"
#if defined __amd64__
"mov $-1, %rcx\n"
#elif defined __i386__
"mov $-1, %ecx\n"
#else
#error
#endif
"cld\n"
"rep stosb\n"
/* unreachable */
"hlt\n"
".type make_stack2,@function\n"
".size make_stack2,.-make_stack2\n"
".popsection\n"
);
static volatile int test_state = 0;
static volatile unsigned long stack_min_addr;
#if defined __amd64__
#define RDI REG_RDI
#define RIP REG_RIP
#define RIP_STRING "rip"
#elif defined __i386__
#define RDI REG_EDI
#define RIP REG_EIP
#define RIP_STRING "eip"
#else
#error
#endif
static void sigsegv(int _, siginfo_t *__, void *uc_)
{
/*
* Some Linux versions didn't clear DF before entering signal
* handler. make_stack1() doesn't have a chance to clear DF
* either so we clear it by hand here.
*/
asm volatile ("cld" ::: "memory");
ucontext_t *uc = uc_;
if (test_state == 0) {
/* Stack is faulted and cleared from RSP to the lowest address. */
stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
if (1) {
printf("stack min %lx\n", stack_min_addr);
}
uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
test_state = 1;
} else if (test_state == 1) {
/* Stack has been cleared from top to bottom. */
unsigned long stack_max_addr = uc->uc_mcontext.gregs[RDI];
if (1) {
printf("stack max %lx\n", stack_max_addr);
}
/* Start faulting pages on stack and see what happens. */
uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
test_state = 2;
} else if (test_state == 2) {
/* Stack page is NX -- good, test next page. */
uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
/* One more SIGSEGV and test ends. */
test_state = 3;
}
} else {
printf("PASS\tAll stack pages are NX\n");
_exit(EXIT_SUCCESS);
}
}
static void sigtrap(int _, siginfo_t *__, void *uc_)
{
const ucontext_t *uc = uc_;
unsigned long rip = uc->uc_mcontext.gregs[RIP];
printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n", rip);
_exit(EXIT_FAILURE);
}
int main(void)
{
{
struct sigaction act = {};
sigemptyset(&act.sa_mask);
act.sa_flags = SA_SIGINFO;
act.sa_sigaction = &sigsegv;
int rv = sigaction(SIGSEGV, &act, NULL);
assert(rv == 0);
}
{
struct sigaction act = {};
sigemptyset(&act.sa_mask);
act.sa_flags = SA_SIGINFO;
act.sa_sigaction = &sigtrap;
int rv = sigaction(SIGTRAP, &act, NULL);
assert(rv == 0);
}
{
struct rlimit rlim;
int rv = getrlimit(RLIMIT_STACK, &rlim);
assert(rv == 0);
/* Cap stack at time-honored 8 MiB value. */
rlim.rlim_max = rlim.rlim_cur;
if (rlim.rlim_max > 8 * 1024 * 1024) {
rlim.rlim_max = 8 * 1024 * 1024;
}
rv = setrlimit(RLIMIT_STACK, &rlim);
assert(rv == 0);
}
{
/*
* We don't know now much stack SIGSEGV handler uses.
* Bump this by 1 page every time someone complains,
* or rewrite it in assembly.
*/
const size_t len = SIGSTKSZ;
void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
assert(p != MAP_FAILED);
stack_t ss = {};
ss.ss_sp = p;
ss.ss_size = len;
int rv = sigaltstack(&ss, NULL);
assert(rv == 0);
}
make_stack1();
/*
* Unreachable, but if _this_ INT3 is ever reached, it's a bug somewhere.
* Fold it into main SIGTRAP pathway.
*/
__builtin_trap();
}