linux-stable/tools/testing/selftests/mm/uffd-common.c
Axel Rasmussen 7cf0f9e837 selftests/mm: refactor uffd_poll_thread to allow custom fault handlers
Previously, we had "one fault handler to rule them all", which used
several branches to deal with all of the scenarios required by all of the
various tests.

In upcoming patches, I plan to add a new test, which has its own slightly
different fault handling logic.  Instead of continuing to add cruft to the
existing fault handler, let's allow tests to define custom ones, separate
from other tests.
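
For example, a test with its own fault-handling needs can now install a
custom callback before spawning the poll thread, instead of growing the
shared handler. A minimal sketch (the handler name and body here are
hypothetical; uffd_poll_thread() and the handle_fault member are real):

	static void my_handle_fault(struct uffd_msg *msg,
				    struct uffd_args *args)
	{
		/* test-specific handling of the fault event */
	}

	...
	struct uffd_args args = { 0 };
	pthread_t thread;

	args.handle_fault = my_handle_fault;
	if (pthread_create(&thread, NULL, uffd_poll_thread, &args))
		err("pthread_create");

If handle_fault is left NULL, uffd_poll_thread() falls back to the
default uffd_handle_page_fault().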

Link: https://lkml.kernel.org/r/20230707215540.2324998-8-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gaosheng Cui <cuigaosheng1@huawei.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Cc: Jiaqi Yan <jiaqiyan@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: ZhangPeng <zhangpeng362@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-18 10:12:17 -07:00


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd test utility functions
*
* Copyright (C) 2015-2023 Red Hat, Inc.
*/
#include "uffd-common.h"
#define BASE_PMD_ADDR ((void *)(1UL << 30))
volatile bool test_uffdio_copy_eexist = true;
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
bool map_shared;
bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
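/*
 * Create a memfd of the given size (hugetlb-backed if requested) and
 * punch it fully empty, so tests start from unpopulated backing memory.
 */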
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
unsigned int memfd_flags = 0;
int mem_fd;
if (hugetlb)
memfd_flags = MFD_HUGETLB;
mem_fd = memfd_create("uffd-test", memfd_flags);
if (mem_fd < 0)
err("memfd_create");
if (ftruncate(mem_fd, mem_size))
err("ftruncate");
if (fallocate(mem_fd,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
mem_size))
err("fallocate");
return mem_fd;
}
static void anon_release_pages(char *rel_area)
{
if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
err("madvise(MADV_DONTNEED) failed");
}
static int anon_allocate_area(void **alloc_area, bool is_src)
{
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
if (*alloc_area == MAP_FAILED) {
*alloc_area = NULL;
return -errno;
}
return 0;
}
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
static void hugetlb_release_pages(char *rel_area)
{
if (!map_shared) {
if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
err("madvise(MADV_DONTNEED) failed");
} else {
if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
err("madvise(MADV_REMOVE) failed");
}
}
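/*
 * Carve src (offset 0) and dst (offset size) out of a single
 * double-sized hugetlb memfd; for shared tests, also map an alias of
 * the same offset so the pages can be reached via a second mapping.
 */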
static int hugetlb_allocate_area(void **alloc_area, bool is_src)
{
off_t size = nr_pages * page_size;
off_t offset = is_src ? 0 : size;
void *area_alias = NULL;
char **alloc_area_alias;
int mem_fd = uffd_mem_fd_create(size * 2, true);
*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
(map_shared ? MAP_SHARED : MAP_PRIVATE) |
(is_src ? 0 : MAP_NORESERVE),
mem_fd, offset);
if (*alloc_area == MAP_FAILED) {
*alloc_area = NULL;
return -errno;
}
if (map_shared) {
area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED, mem_fd, offset);
if (area_alias == MAP_FAILED)
return -errno;
}
if (is_src) {
alloc_area_alias = &area_src_alias;
} else {
alloc_area_alias = &area_dst_alias;
}
if (area_alias)
*alloc_area_alias = area_alias;
close(mem_fd);
return 0;
}
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
if (!map_shared)
return;
*start = (unsigned long) area_dst_alias + offset;
}
static void shmem_release_pages(char *rel_area)
{
if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
err("madvise(MADV_REMOVE) failed");
}
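/*
 * Lay out the areas at fixed addresses from BASE_PMD_ADDR: src first,
 * dst 2 * (bytes + hpage_size) above it, with each alias one huge page
 * past its area so the VMAs cannot merge. Area and alias map the same
 * memfd offset, hence the same pages.
 */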
static int shmem_allocate_area(void **alloc_area, bool is_src)
{
void *area_alias = NULL;
size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
unsigned long offset = is_src ? 0 : bytes;
char *p = NULL, *p_alias = NULL;
int mem_fd = uffd_mem_fd_create(bytes * 2, false);
	/* TODO: clean this up. Using a static addr is ugly */
p = BASE_PMD_ADDR;
if (!is_src)
/* src map + alias + interleaved hpages */
p += 2 * (bytes + hpage_size);
p_alias = p;
p_alias += bytes;
p_alias += hpage_size; /* Prevent src/dst VMA merge */
*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
mem_fd, offset);
if (*alloc_area == MAP_FAILED) {
*alloc_area = NULL;
return -errno;
}
if (*alloc_area != p)
err("mmap of memfd failed at %p", p);
area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
mem_fd, offset);
if (area_alias == MAP_FAILED) {
munmap(*alloc_area, bytes);
*alloc_area = NULL;
return -errno;
}
if (area_alias != p_alias)
err("mmap of anonymous memory failed at %p", p_alias);
if (is_src)
area_src_alias = area_alias;
else
area_dst_alias = area_alias;
close(mem_fd);
return 0;
}
static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
*start = (unsigned long)area_dst_alias + offset;
}
static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
{
if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
read_pmd_pagesize()))
err("Did not find expected %d number of hugepages",
expect_nr_hpages);
}
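/*
 * Per-memory-type hooks; tests select one of these tables through the
 * global uffd_test_ops pointer.
 */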
struct uffd_test_ops anon_uffd_test_ops = {
.allocate_area = anon_allocate_area,
.release_pages = anon_release_pages,
.alias_mapping = noop_alias_mapping,
.check_pmd_mapping = NULL,
};
struct uffd_test_ops shmem_uffd_test_ops = {
.allocate_area = shmem_allocate_area,
.release_pages = shmem_release_pages,
.alias_mapping = shmem_alias_mapping,
.check_pmd_mapping = shmem_check_pmd_mapping,
};
struct uffd_test_ops hugetlb_uffd_test_ops = {
.allocate_area = hugetlb_allocate_area,
.release_pages = hugetlb_release_pages,
.alias_mapping = hugetlb_alias_mapping,
.check_pmd_mapping = NULL,
};
void uffd_stats_report(struct uffd_args *args, int n_cpus)
{
int i;
unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
for (i = 0; i < n_cpus; i++) {
miss_total += args[i].missing_faults;
wp_total += args[i].wp_faults;
minor_total += args[i].minor_faults;
}
printf("userfaults: ");
if (miss_total) {
printf("%llu missing (", miss_total);
for (i = 0; i < n_cpus; i++)
printf("%lu+", args[i].missing_faults);
printf("\b) ");
}
if (wp_total) {
printf("%llu wp (", wp_total);
for (i = 0; i < n_cpus; i++)
printf("%lu+", args[i].wp_faults);
printf("\b) ");
}
if (minor_total) {
printf("%llu minor (", minor_total);
for (i = 0; i < n_cpus; i++)
printf("%lu+", args[i].minor_faults);
printf("\b)");
}
printf("\n");
}
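/*
 * Open the userfaultfd and negotiate UFFD_API. On success, *features
 * is replaced with the feature set the kernel actually supports.
 */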
int userfaultfd_open(uint64_t *features)
{
struct uffdio_api uffdio_api;
uffd = uffd_open(UFFD_FLAGS);
if (uffd < 0)
return -1;
uffd_flags = fcntl(uffd, F_GETFD, NULL);
uffdio_api.api = UFFD_API;
uffdio_api.features = *features;
if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_SYS_PTRACE? */
return -1;
if (uffdio_api.api != UFFD_API)
err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
*features = uffdio_api.features;
return 0;
}
static inline void munmap_area(void **area)
{
if (*area)
if (munmap(*area, nr_pages * page_size))
err("munmap");
*area = NULL;
}
static void uffd_test_ctx_clear(void)
{
size_t i;
if (pipefd) {
for (i = 0; i < nr_cpus * 2; ++i) {
if (close(pipefd[i]))
err("close pipefd");
}
free(pipefd);
pipefd = NULL;
}
if (count_verify) {
free(count_verify);
count_verify = NULL;
}
if (uffd != -1) {
if (close(uffd))
err("close uffd");
uffd = -1;
}
munmap_area((void **)&area_src);
munmap_area((void **)&area_src_alias);
munmap_area((void **)&area_dst);
munmap_area((void **)&area_dst_alias);
munmap_area((void **)&area_remap);
}
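/*
 * (Re)initialize the whole test context: tear down any previous state,
 * allocate area_src and area_dst, open the userfaultfd, seed area_src
 * with per-page counters (mirrored in count_verify), punch area_dst
 * back to empty, and create one shutdown pipe pair per cpu.
 */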
int uffd_test_ctx_init(uint64_t features, const char **errmsg)
{
unsigned long nr, cpu;
int ret;
uffd_test_ctx_clear();
ret = uffd_test_ops->allocate_area((void **)&area_src, true);
ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
if (ret) {
if (errmsg)
*errmsg = "memory allocation failed";
return ret;
}
ret = userfaultfd_open(&features);
if (ret) {
if (errmsg)
*errmsg = "possible lack of priviledge";
return ret;
}
count_verify = malloc(nr_pages * sizeof(unsigned long long));
if (!count_verify)
err("count_verify");
for (nr = 0; nr < nr_pages; nr++) {
*area_mutex(area_src, nr) =
(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
count_verify[nr] = *area_count(area_src, nr) = 1;
/*
		 * In the transition from 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below that is always
		 * non-zero after the count, to avoid my_bcmp
		 * triggering false positives.
*/
*(area_count(area_src, nr) + 1) = 1;
}
/*
* After initialization of area_src, we must explicitly release pages
* for area_dst to make sure it's fully empty. Otherwise we could have
	 * some area_dst pages be erroneously initialized with zero pages,
* hence we could hit memory corruption later in the test.
*
* One example is when THP is globally enabled, above allocate_area()
* calls could have the two areas merged into a single VMA (as they
* will have the same VMA flags so they're mergeable). When we
* initialize the area_src above, it's possible that some part of
* area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst. It could cause some of
	 * area_dst to not be trapped by missing userfaults.
*
	 * This release_pages() guarantees that, even if that happened, we
	 * proactively split the THP and drop any accidentally initialized
	 * pages within area_dst.
*/
uffd_test_ops->release_pages(area_dst);
pipefd = malloc(sizeof(int) * nr_cpus * 2);
if (!pipefd)
err("pipefd");
for (cpu = 0; cpu < nr_cpus; cpu++)
if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
err("pipe");
return 0;
}
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
struct uffdio_writeprotect prms;
	prms.range.start = start;
	prms.range.len = len;
	/* Set write protection, or clear it (clearing also wakes waiters) */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("UFFDIO_WRITEPROTECT failed: address=0x%"PRIx64,
		    (uint64_t)start);
}
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
struct uffdio_continue req;
int ret;
req.range.start = start;
req.range.len = len;
req.mode = 0;
if (wp)
req.mode |= UFFDIO_CONTINUE_MODE_WP;
if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
(uint64_t)start);
/*
* Error handling within the kernel for continue is subtly different
* from copy or zeropage, so it may be a source of bugs. Trigger an
* error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
*/
req.mapped = 0;
ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
if (ret >= 0 || req.mapped != -EEXIST)
err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
ret, (int64_t) req.mapped);
}
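/*
 * Read one event from the userfaultfd. Returns 0 on a complete
 * message, 1 if the caller should retry (EAGAIN/EINTR); any other
 * failure is fatal.
 */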
int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	int ret = read(ufd, msg, sizeof(*msg));
if (ret != sizeof(*msg)) {
if (ret < 0) {
if (errno == EAGAIN || errno == EINTR)
return 1;
err("blocking read error");
} else {
err("short read");
}
}
return 0;
}
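/*
 * Default fault handler shared by most tests: resolves write-protect
 * faults via UFFDIO_WRITEPROTECT, minor faults via UFFDIO_CONTINUE
 * (after bit-flipping the page through area_dst), and missing faults
 * via UFFDIO_COPY, counting each kind in args.
 */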
void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
{
unsigned long offset;
if (msg->event != UFFD_EVENT_PAGEFAULT)
err("unexpected msg event %u", msg->event);
if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
/* Write protect page faults */
wp_range(uffd, msg->arg.pagefault.address, page_size, false);
args->wp_faults++;
} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
uint8_t *area;
int b;
/*
* Minor page faults
*
* To prove we can modify the original range for testing
* purposes, we're going to bit flip this range before
* continuing.
*
* Note that this requires all minor page fault tests operate on
* area_dst (non-UFFD-registered) and area_dst_alias
* (UFFD-registered).
*/
area = (uint8_t *)(area_dst +
((char *)msg->arg.pagefault.address -
area_dst_alias));
for (b = 0; b < page_size; ++b)
area[b] = ~area[b];
continue_range(uffd, msg->arg.pagefault.address, page_size,
args->apply_wp);
args->minor_faults++;
} else {
/*
* Missing page faults.
*
		 * Here we verify that none of the missing-mode faults
		 * is a write fault. That is guaranteed because the
		 * only threads that trigger uffd faults are the
		 * locking threads, and their first instruction
		 * touching the missing page is always
		 * pthread_mutex_lock().
		 *
		 * Note that we rely on an NPTL glibc implementation
		 * detail for this: the lock type is always read at
		 * the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * any locking operation. Relying on this detail is
		 * fragile, since a pthread-compatible library could
		 * implement its locks without a type field, and the
		 * check would then fail when linking against it.
		 * However, this strict check has caught bugs before,
		 * so we keep it around as a useful hint for when it
		 * fails again. If it ever breaks on another glibc
		 * implementation, we'll revisit.
*/
if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
err("unexpected write fault");
offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
offset &= ~(page_size-1);
if (copy_page(uffd, offset, args->apply_wp))
args->missing_faults++;
}
}
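/*
 * Per-cpu event loop: poll the userfaultfd alongside a per-cpu shutdown
 * pipe, dispatching page fault events to args->handle_fault (tests may
 * install their own handler; uffd_handle_page_fault() is the default)
 * and handling fork/remove/remap events inline.
 */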
void *uffd_poll_thread(void *arg)
{
struct uffd_args *args = (struct uffd_args *)arg;
unsigned long cpu = args->cpu;
struct pollfd pollfd[2];
struct uffd_msg msg;
struct uffdio_register uffd_reg;
int ret;
char tmp_chr;
if (!args->handle_fault)
args->handle_fault = uffd_handle_page_fault;
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
for (;;) {
ret = poll(pollfd, 2, -1);
if (ret <= 0) {
if (errno == EINTR || errno == EAGAIN)
continue;
err("poll error: %d", ret);
}
if (pollfd[1].revents) {
if (!(pollfd[1].revents & POLLIN))
err("pollfd[1].revents %d", pollfd[1].revents);
if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
err("read pipefd error");
break;
}
if (!(pollfd[0].revents & POLLIN))
err("pollfd[0].revents %d", pollfd[0].revents);
if (uffd_read_msg(uffd, &msg))
continue;
switch (msg.event) {
default:
err("unexpected msg event %u\n", msg.event);
break;
case UFFD_EVENT_PAGEFAULT:
args->handle_fault(&msg, args);
break;
case UFFD_EVENT_FORK:
close(uffd);
uffd = msg.arg.fork.ufd;
pollfd[0].fd = uffd;
break;
case UFFD_EVENT_REMOVE:
uffd_reg.range.start = msg.arg.remove.start;
uffd_reg.range.len = msg.arg.remove.end -
msg.arg.remove.start;
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
err("remove failure");
break;
case UFFD_EVENT_REMAP:
area_remap = area_dst; /* save for later unmap */
area_dst = (char *)(unsigned long)msg.arg.remap.to;
break;
}
}
return NULL;
}
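/*
 * Redo a just-completed copy, aimed at the alias mapping when one
 * exists; the pages are already mapped, so the retry must fail with
 * -EEXIST.
 */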
static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
unsigned long offset)
{
uffd_test_ops->alias_mapping(&uffdio_copy->dst,
uffdio_copy->len,
offset);
if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
if (uffdio_copy->copy != -EEXIST)
err("UFFDIO_COPY retry error: %"PRId64,
(int64_t)uffdio_copy->copy);
} else {
err("UFFDIO_COPY retry unexpected: %"PRId64,
(int64_t)uffdio_copy->copy);
}
}
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
struct uffdio_range uffdio_wake;
uffdio_wake.start = addr;
uffdio_wake.len = len;
	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) {
		fprintf(stderr, "error waking %lu\n", addr);
		exit(1);
	}
}
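/*
 * Resolve one missing fault with UFFDIO_COPY. Returns 1 if this call
 * populated the page, 0 if it already existed (-EEXIST), in which case
 * any blocked waiters are woken manually.
 */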
int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
{
struct uffdio_copy uffdio_copy;
if (offset >= nr_pages * page_size)
err("unexpected offset %lu\n", offset);
uffdio_copy.dst = (unsigned long) area_dst + offset;
uffdio_copy.src = (unsigned long) area_src + offset;
uffdio_copy.len = page_size;
if (wp)
uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
else
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
if (uffdio_copy.copy != -EEXIST)
err("UFFDIO_COPY error: %"PRId64,
(int64_t)uffdio_copy.copy);
wake_range(ufd, uffdio_copy.dst, page_size);
} else if (uffdio_copy.copy != page_size) {
err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
} else {
if (test_uffdio_copy_eexist && retry) {
test_uffdio_copy_eexist = false;
retry_copy_page(ufd, &uffdio_copy, offset);
}
return 1;
}
return 0;
}
int copy_page(int ufd, unsigned long offset, bool wp)
{
return __copy_page(ufd, offset, false, wp);
}
int uffd_open_dev(unsigned int flags)
{
int fd, uffd;
fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
if (fd < 0)
return fd;
uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
close(fd);
return uffd;
}
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
return syscall(__NR_userfaultfd, flags);
#else
return -1;
#endif
}
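/*
 * Obtain a userfaultfd via the syscall when available, falling back to
 * /dev/userfaultfd otherwise.
 */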
int uffd_open(unsigned int flags)
{
int uffd = uffd_open_sys(flags);
if (uffd < 0)
uffd = uffd_open_dev(flags);
return uffd;
}
int uffd_get_features(uint64_t *features)
{
struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
/*
	 * This should work by default in most kernels; the feature list
	 * will be the same no matter what we pass in here.
*/
int fd = uffd_open(UFFD_USER_MODE_ONLY);
if (fd < 0)
		/* Maybe the kernel predates UFFD_USER_MODE_ONLY? */
fd = uffd_open(0);
if (fd < 0)
return fd;
if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
close(fd);
return -errno;
}
*features = uffdio_api.features;
close(fd);
return 0;
}