xen: lock pte pages while pinning/unpinning

When a pagetable is created, it is made globally visible in the rmap
prio tree before it is pinned via arch_dup_mmap(), and remains in the
rmap tree while it is unpinned with arch_exit_mmap().

This means that other CPUs may race with the pinning/unpinning
process and see a pte page in the window between when it is marked RO
and when it is actually pinned, causing any pte updates to fail with
write-protect faults.

As a result, all pte pages must be properly locked, and only unlocked
once the pinning/unpinning process has finished.

In order to avoid taking spinlocks for the whole pagetable - which may
overflow the PREEMPT_BITS portion of the preempt counter - each pte page
is locked and pinned individually, and the whole pagetable is then
pinned as the final step.
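
For illustration only (not part of the patch), the PT_PTE path of
pin_page() in the diff below condenses to roughly the following sketch;
the pin_one_pte_page() name is hypothetical, and the highmem and
TLB-flush handling are omitted:

static int pin_one_pte_page(struct page *page)
{
        void *pt = lowmem_page_address(page);
        unsigned long pfn = page_to_pfn(page);
        struct multicall_space mcs = __xen_mc_entry(0);
        spinlock_t *ptl;

        /* take the pte lock so no other CPU can update ptes in this
           page while it is being remapped RO and pinned */
        ptl = lock_pte(page);

        /* queue the RO remapping and the pin as one hypercall batch */
        MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                pfn_pte(pfn, PAGE_KERNEL_RO), 0);
        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

        /* the lock is only dropped once the whole batch has completed */
        if (ptl)
                xen_mc_callback(do_unlock, ptl);

        return 0;
}

The unpin path (unpin_page() below) does the reverse: it takes the pte
lock, unpins the page first, then remaps it read-write, again deferring
the unlock until the multicall batch has completed.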

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickens <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andi Kleen <ak@suse.de>
Cc: Keir Fraser <keir@xensource.com>
Cc: Jan Beulich <jbeulich@novell.com>
Jeremy Fitzhardinge, 2007-10-16 11:51:30 -07:00 (committed by Jeremy Fitzhardinge)
parent 9f79991d41
commit 74260714c5
3 changed files with 103 additions and 41 deletions


@@ -666,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -675,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -690,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -806,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 


@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
+
 /*
   (Yet another) pagetable walker. This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP. But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 
 		pud = pud_offset(pgd, 0);
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 
 			pmd = pmd_offset(pud, 0);
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
 
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits. So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -585,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
 
+	spin_unlock(&mm->page_table_lock);
 }


@@ -137,7 +137,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"
 
 #