lib: add support for device coherent type in test_hmm

Device Coherent type uses device memory that is coherently accesible by
the CPU.  This could be shown as SP (special purpose) memory range at the
BIOS-e820 memory enumeration.  If no SP memory is supported in system,
this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Private and coherent device mirror instances can be created in the same
probed.  This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1.  In this case, it will create four instances of
device_mirror.  The first two correspond to private device type, the last
two to coherent type.  Then, they can be easily accessed from user space
through /dev/hmm_mirror<num_device>.  Usually num_device 0 and 1 are for
private, and 2 and 3 for coherent types.  If no module parameters are
passed, two instances of private type device_mirror will be created only.

Link: https://lkml.kernel.org/r/20220715150521.18165-11-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Poppple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Alex Sierra 2022-07-15 10:05:17 -05:00 committed by akpm
parent 25b80162d5
commit 4c2e0f764e
2 changed files with 196 additions and 61 deletions

View File

@ -32,11 +32,22 @@
#include "test_hmm_uapi.h"
#define DMIRROR_NDEVICES 2
#define DMIRROR_NDEVICES 4
#define DMIRROR_RANGE_FAULT_TIMEOUT 1000
#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE 16
/*
* For device_private pages, dpage is just a dummy struct page
* representing a piece of device memory. dmirror_devmem_alloc_page
* allocates a real system memory page as backing storage to fake a
* real device. zone_device_data points to that backing page. But
* for device_coherent memory, the struct page represents real
* physical CPU-accessible memory that we can use directly.
*/
#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
(page)->zone_device_data : (page))
static unsigned long spm_addr_dev0;
module_param(spm_addr_dev0, long, 0644);
MODULE_PARM_DESC(spm_addr_dev0,
@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
return 0;
}
static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
return (mdevice->zone_device_type ==
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
}
static enum migrate_vma_direction
dmirror_select_device(struct dmirror *dmirror)
{
return (dmirror->mdevice->zone_device_type ==
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
MIGRATE_VMA_SELECT_DEVICE_COHERENT;
}
static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
vfree(bounce->ptr);
@ -575,16 +601,19 @@ err_devmem:
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
struct page *dpage = NULL;
struct page *rpage;
struct page *rpage = NULL;
/*
* This is a fake device so we alloc real system memory to store
* our device memory.
* For ZONE_DEVICE private type, this is a fake device so we allocate
* real system memory to store our device memory.
* For ZONE_DEVICE coherent type we use the actual dpage to store the
* data and ignore rpage.
*/
rpage = alloc_page(GFP_HIGHUSER);
if (!rpage)
return NULL;
if (dmirror_is_private_zone(mdevice)) {
rpage = alloc_page(GFP_HIGHUSER);
if (!rpage)
return NULL;
}
spin_lock(&mdevice->lock);
if (mdevice->free_pages) {
@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
return dpage;
error:
__free_page(rpage);
if (rpage)
__free_page(rpage);
return NULL;
}
@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
* unallocated pte_none() or read-only zero page.
*/
spage = migrate_pfn_to_page(*src);
if (WARN(spage && is_zone_device_page(spage),
"page already in device spage pfn: 0x%lx\n",
page_to_pfn(spage)))
continue;
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
rpage = dpage->zone_device_data;
rpage = BACKING_PAGE(dpage);
if (spage)
copy_highpage(rpage, spage);
else
@ -648,6 +682,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
*/
rpage->zone_device_data = dmirror;
pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
page_to_pfn(spage), page_to_pfn(dpage));
*dst = migrate_pfn(page_to_pfn(dpage));
if ((*src & MIGRATE_PFN_WRITE) ||
(!spage && args->vma->vm_flags & VM_WRITE))
@ -725,11 +761,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
if (!dpage)
continue;
/*
* Store the page that holds the data so the page table
* doesn't have to deal with ZONE_DEVICE private pages.
*/
entry = dpage->zone_device_data;
entry = BACKING_PAGE(dpage);
if (*dst & MIGRATE_PFN_WRITE)
entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
@ -815,15 +847,126 @@ static int dmirror_exclusive(struct dmirror *dmirror,
return ret;
}
static int dmirror_migrate(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
struct dmirror *dmirror)
{
const unsigned long *src = args->src;
unsigned long *dst = args->dst;
unsigned long start = args->start;
unsigned long end = args->end;
unsigned long addr;
for (addr = start; addr < end; addr += PAGE_SIZE,
src++, dst++) {
struct page *dpage, *spage;
spage = migrate_pfn_to_page(*src);
if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
continue;
if (WARN_ON(!is_device_private_page(spage) &&
!is_device_coherent_page(spage)))
continue;
spage = BACKING_PAGE(spage);
dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
if (!dpage)
continue;
pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
page_to_pfn(spage), page_to_pfn(dpage));
lock_page(dpage);
xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
copy_highpage(dpage, spage);
*dst = migrate_pfn(page_to_pfn(dpage));
if (*src & MIGRATE_PFN_WRITE)
*dst |= MIGRATE_PFN_WRITE;
}
return 0;
}
static unsigned long
dmirror_successful_migrated_pages(struct migrate_vma *migrate)
{
unsigned long cpages = 0;
unsigned long i;
for (i = 0; i < migrate->npages; i++) {
if (migrate->src[i] & MIGRATE_PFN_VALID &&
migrate->src[i] & MIGRATE_PFN_MIGRATE)
cpages++;
}
return cpages;
}
static int dmirror_migrate_to_system(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
unsigned long src_pfns[64];
unsigned long dst_pfns[64];
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct migrate_vma args;
unsigned long next;
int ret;
start = cmd->addr;
end = start + size;
if (end < start)
return -EINVAL;
/* Since the mm is for the mirrored process, get a reference first. */
if (!mmget_not_zero(mm))
return -EINVAL;
cmd->cpages = 0;
mmap_read_lock(mm);
for (addr = start; addr < end; addr = next) {
vma = vma_lookup(mm, addr);
if (!vma || !(vma->vm_flags & VM_READ)) {
ret = -EINVAL;
goto out;
}
next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
if (next > vma->vm_end)
next = vma->vm_end;
args.vma = vma;
args.src = src_pfns;
args.dst = dst_pfns;
args.start = addr;
args.end = next;
args.pgmap_owner = dmirror->mdevice;
args.flags = dmirror_select_device(dmirror);
ret = migrate_vma_setup(&args);
if (ret)
goto out;
pr_debug("Migrating from device mem to sys mem\n");
dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
migrate_vma_pages(&args);
cmd->cpages += dmirror_successful_migrated_pages(&args);
migrate_vma_finalize(&args);
}
out:
mmap_read_unlock(mm);
mmput(mm);
return ret;
}
static int dmirror_migrate_to_device(struct dmirror *dmirror,
struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
struct mm_struct *mm = dmirror->notifier.mm;
struct vm_area_struct *vma;
unsigned long src_pfns[64] = { 0 };
unsigned long dst_pfns[64] = { 0 };
struct dmirror_bounce bounce;
struct migrate_vma args;
unsigned long next;
@ -860,6 +1003,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
if (ret)
goto out;
pr_debug("Migrating from sys mem to device mem\n");
dmirror_migrate_alloc_and_copy(&args, dmirror);
migrate_vma_pages(&args);
dmirror_migrate_finalize_and_map(&args, dmirror);
@ -868,7 +1012,10 @@ static int dmirror_migrate(struct dmirror *dmirror,
mmap_read_unlock(mm);
mmput(mm);
/* Return the migrated data for verification. */
/*
* Return the migrated data for verification.
* Only for pages in device zone
*/
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
return ret;
@ -911,6 +1058,12 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
else
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
} else if (is_device_coherent_page(page)) {
/* Is the page migrated to this device or some other? */
if (dmirror->mdevice == dmirror_page_to_device(page))
*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
else
*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
} else if (is_zero_pfn(page_to_pfn(page)))
*perm = HMM_DMIRROR_PROT_ZERO;
else
@ -1098,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_write(dmirror, &cmd);
break;
case HMM_DMIRROR_MIGRATE:
ret = dmirror_migrate(dmirror, &cmd);
case HMM_DMIRROR_MIGRATE_TO_DEV:
ret = dmirror_migrate_to_device(dmirror, &cmd);
break;
case HMM_DMIRROR_MIGRATE_TO_SYS:
ret = dmirror_migrate_to_system(dmirror, &cmd);
break;
case HMM_DMIRROR_EXCLUSIVE:
@ -1161,14 +1318,13 @@ static const struct file_operations dmirror_fops = {
static void dmirror_devmem_free(struct page *page)
{
struct page *rpage = page->zone_device_data;
struct page *rpage = BACKING_PAGE(page);
struct dmirror_device *mdevice;
if (rpage)
if (rpage != page)
__free_page(rpage);
mdevice = dmirror_page_to_device(page);
spin_lock(&mdevice->lock);
mdevice->cfree++;
page->zone_device_data = mdevice->free_pages;
@ -1176,43 +1332,11 @@ static void dmirror_devmem_free(struct page *page)
spin_unlock(&mdevice->lock);
}
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
struct dmirror *dmirror)
{
const unsigned long *src = args->src;
unsigned long *dst = args->dst;
unsigned long start = args->start;
unsigned long end = args->end;
unsigned long addr;
for (addr = start; addr < end; addr += PAGE_SIZE,
src++, dst++) {
struct page *dpage, *spage;
spage = migrate_pfn_to_page(*src);
if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
continue;
spage = spage->zone_device_data;
dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
if (!dpage)
continue;
lock_page(dpage);
xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
copy_highpage(dpage, spage);
*dst = migrate_pfn(page_to_pfn(dpage));
if (*src & MIGRATE_PFN_WRITE)
*dst |= MIGRATE_PFN_WRITE;
}
return 0;
}
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args;
unsigned long src_pfns;
unsigned long dst_pfns;
unsigned long src_pfns = 0;
unsigned long dst_pfns = 0;
struct page *rpage;
struct dmirror *dmirror;
vm_fault_t ret;
@ -1232,7 +1356,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.src = &src_pfns;
args.dst = &dst_pfns;
args.pgmap_owner = dmirror->mdevice;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.flags = dmirror_select_device(dmirror);
if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS;
@ -1311,6 +1435,12 @@ static int __init hmm_dmirror_init(void)
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
if (spm_addr_dev0 && spm_addr_dev1) {
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
dmirror_devices[ndevices++].zone_device_type =
HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
}
for (id = 0; id < ndevices; id++) {
ret = dmirror_device_init(dmirror_devices + id, id);
if (ret)
@ -1333,7 +1463,8 @@ static void __exit hmm_dmirror_exit(void)
int id;
for (id = 0; id < DMIRROR_NDEVICES; id++)
dmirror_device_remove(dmirror_devices + id);
if (dmirror_devices[id].zone_device_type)
dmirror_device_remove(dmirror_devices + id);
unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

View File

@ -50,6 +50,8 @@ struct hmm_dmirror_cmd {
* device the ioctl() is made
* HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
* other device
* HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
* the ioctl() is made
*/
enum {
HMM_DMIRROR_PROT_ERROR = 0xFF,
@ -61,6 +63,8 @@ enum {
HMM_DMIRROR_PROT_ZERO = 0x10,
HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40,
HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50,
};
enum {