From f4fccac05f7f6bacb8e481a84d175e85ffcf9fe2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:24:59 +0100 Subject: [PATCH 01/24] x86/efi: Simplify EFI_DEBUG ... and lose one #ifdef .. #endif sandwich. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1d3372ac9c66..f396163b0402 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,7 +51,7 @@ #include #include -#define EFI_DEBUG 1 +#define EFI_DEBUG #define EFI_MIN_RESERVE 5120 @@ -398,9 +398,9 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } -#if EFI_DEBUG static void __init print_efi_memmap(void) { +#ifdef EFI_DEBUG efi_memory_desc_t *md; void *p; int i; @@ -415,8 +415,8 @@ static void __init print_efi_memmap(void) md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -} #endif /* EFI_DEBUG */ +} void __init efi_reserve_boot_services(void) { @@ -696,10 +696,7 @@ void __init efi_init(void) x86_platform.set_wallclock = efi_set_rtc_mmss; } #endif - -#if EFI_DEBUG print_efi_memmap(); -#endif } void __init efi_late_init(void) From 0fd64c23fdf556e9e68580cff03b3505797bbf53 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:00 +0100 Subject: [PATCH 02/24] x86/mm/pageattr: Lookup address in an arbitrary PGD This is preparatory work in order to be able to map pages into a specified PGD and not implicitly and only into init_mm. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bb32480c2d71..c53de62a1170 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -30,6 +30,7 @@ */ struct cpa_data { unsigned long *vaddr; + pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; int numpages; @@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, return prot; } -/* - * Lookup the page table entry for a virtual address. Return a pointer - * to the entry and the level of the mapping. - * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. - */ -pte_t *lookup_address(unsigned long address, unsigned int *level) +static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) { - pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; @@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return __lookup_address_in_pgd(pgd_offset_k(address), address, level); +} EXPORT_SYMBOL_GPL(lookup_address); +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return __lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() From f3f729661e8db476ac427a97de015307aebb7404 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:01 +0100 Subject: [PATCH 03/24] x86/mm/pageattr: Add a PGD pagetable populating function This allocates, if necessary, and populates the corresponding PGD entry with a PUD page. The next population level is a dummy macro which will be removed by the next patch and it is added here to keep the patch small and easily reviewable but not break bisection, at the same time. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c53de62a1170..4b47ae0602e1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,45 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +#define populate_pud(cpa, addr, pgd, pgprot) (-1) + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ + pgprot_t pgprot = __pgprot(_KERNPG_TABLE); + bool allocd_pgd = false; + pgd_t *pgd_entry; + pud_t *pud = NULL; /* shut up gcc */ + int ret; + + pgd_entry = cpa->pgd + pgd_index(addr); + + /* + * Allocate a PUD page and hand it down for mapping. + */ + if (pgd_none(*pgd_entry)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pud) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + allocd_pgd = true; + } + + pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); + + ret = populate_pud(cpa, addr, pgd_entry, pgprot); + if (ret < 0) + return ret; + + cpa->numpages = ret; + return 0; +} + static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { From 4b23538d88c87d9c693ad87c8c808e92a505a6e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:02 +0100 Subject: [PATCH 04/24] x86/mm/pageattr: Add a PUD pagetable populating function Add the next level of the pagetable populating function, we handle chunks around a 1G boundary by mapping them with the lower level functions - otherwise we use 1G pages for the mappings, thus using as less amount of pagetable pages as possible. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 87 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4b47ae0602e1..81deca77b871 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,7 +666,92 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } -#define populate_pud(cpa, addr, pgd, pgprot) (-1) +static int alloc_pmd_page(pud_t *pud) +{ + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pmd) + return -1; + + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + return 0; +} + +#define populate_pmd(cpa, start, end, pages, pud, pgprot) (-1) + +static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) +{ + pud_t *pud; + unsigned long end; + int cur_pages = 0; + + end = start + (cpa->numpages << PAGE_SHIFT); + + /* + * Not on a Gb page boundary? => map everything up to it with + * smaller pages. + */ + if (start & (PUD_SIZE - 1)) { + unsigned long pre_end; + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + + pre_end = min_t(unsigned long, end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + + pud = pud_offset(pgd, start); + + /* + * Need a PMD page? + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, + pud, pgprot); + if (cur_pages < 0) + return cur_pages; + + start = pre_end; + } + + /* We mapped them all? */ + if (cpa->numpages == cur_pages) + return cur_pages; + + pud = pud_offset(pgd, start); + + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (end - start >= PUD_SIZE) { + set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE; + cur_pages += PUD_SIZE >> PAGE_SHIFT; + pud++; + } + + /* Map trailing leftover */ + if (start < end) { + int tmp; + + pud = pud_offset(pgd, start); + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, + pud, pgprot); + if (tmp < 0) + return cur_pages; + + cur_pages += tmp; + } + return cur_pages; +} /* * Restrictions for kernel page table do not necessarily apply when mapping in From f900a4b8ab0f462d89a9fcb6173cac1403415b16 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:03 +0100 Subject: [PATCH 05/24] x86/mm/pageattr: Add a PMD pagetable populating function Handle PMD-level mappings the same as PUD ones. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 82 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 81deca77b871..968398b023c0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,16 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +static int alloc_pte_page(pmd_t *pmd) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!pte) + return -1; + + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + return 0; +} + static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -676,7 +686,77 @@ static int alloc_pmd_page(pud_t *pud) return 0; } -#define populate_pmd(cpa, start, end, pages, pud, pgprot) (-1) +#define populate_pte(cpa, start, end, pages, pmd, pgprot) do {} while (0) + +static int populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ + unsigned int cur_pages = 0; + pmd_t *pmd; + + /* + * Not on a 2M boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long pre_end = start + (num_pages << PAGE_SHIFT); + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + + pre_end = min_t(unsigned long, pre_end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(unsigned int, num_pages, cur_pages); + + /* + * Need a PTE page? + */ + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + + start = pre_end; + } + + /* + * We mapped them all? + */ + if (num_pages == cur_pages) + return cur_pages; + + while (end - start >= PMD_SIZE) { + + /* + * We cannot use a 1G page so allocate a PMD page if needed. + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + pmd = pmd_offset(pud, start); + + set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE; + cur_pages += PMD_SIZE >> PAGE_SHIFT; + } + + /* + * Map trailing 4K pages. + */ + if (start < end) { + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, end, num_pages - cur_pages, + pmd, pgprot); + } + return num_pages; +} static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, pgprot_t pgprot) From c6b6f363f7b24aa448994e3a65c4d5b3116acfcc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:04 +0100 Subject: [PATCH 06/24] x86/mm/pageattr: Add a PTE pagetable populating function Handle last level by unconditionally writing the PTEs into the PTE page while paying attention to the NX bit. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 968398b023c0..2a1308a8c072 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -686,7 +686,27 @@ static int alloc_pmd_page(pud_t *pud) return 0; } -#define populate_pte(cpa, start, end, pages, pmd, pgprot) do {} while (0) +static void populate_pte(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, start); + + while (num_pages-- && start < end) { + + /* deal with the NX bit */ + if (!(pgprot_val(pgprot) & _PAGE_NX)) + cpa->pfn &= ~_PAGE_NX; + + set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); + + start += PAGE_SIZE; + cpa->pfn += PAGE_SIZE; + pte++; + } +} static int populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, From 0bb8aeee7b73b21e09d3ea12f2120d974f70b669 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:05 +0100 Subject: [PATCH 07/24] x86/mm/pageattr: Add a PUD error unwinding path In case we encounter an error during the mapping of a region, we want to unwind what we've established so far exactly the way we did the mapping. This is the PUD part kept deliberately small for easier review. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 60 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2a1308a8c072..1cbdbbc35b47 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,6 +666,51 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } +#define unmap_pmd_range(pud, start, pre_end) do {} while (0) + +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + pud_t *pud = pud_offset(pgd, start); + + /* + * Not on a GB page boundary? + */ + if (start & (PUD_SIZE - 1)) { + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + unmap_pmd_range(pud, start, pre_end); + + start = pre_end; + pud++; + } + + /* + * Try to unmap in 1G chunks? + */ + while (end - start >= PUD_SIZE) { + + if (pud_large(*pud)) + pud_clear(pud); + else + unmap_pmd_range(pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; + } + + /* + * 2M leftovers? + */ + if (start < end) + unmap_pmd_range(pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in + * populate_pgd's error path + */ +} + static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -883,9 +928,20 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, pgd_entry, pgprot); - if (ret < 0) - return ret; + if (ret < 0) { + unmap_pud_range(pgd_entry, addr, + addr + (cpa->numpages << PAGE_SHIFT)); + if (allocd_pgd) { + /* + * If I allocated this PUD page, I can just as well + * free it in this error path. + */ + pgd_clear(pgd_entry); + free_page((unsigned long)pud); + } + return ret; + } cpa->numpages = ret; return 0; } From 52a628fb454d13d944bb3c8a89a314cc3affa417 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:06 +0100 Subject: [PATCH 08/24] x86/mm/pageattr: Add last levels of error path We try to free the pagetable pages once we've unmapped our portion. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 94 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1cbdbbc35b47..db8ace29514f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -666,7 +666,99 @@ static int split_large_page(pte_t *kpte, unsigned long address) return 0; } -#define unmap_pmd_range(pud, start, pre_end) do {} while (0) +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); + + /* + * Not on a 2MB page boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. + */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? + */ + if (start < end) + return __unmap_pmd_range(pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { From 82f0712ca0f947170e785300b5c39d9c25e2f6ff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:07 +0100 Subject: [PATCH 09/24] x86/mm/cpa: Map in an arbitrary pgd Add the ability to map pages in an arbitrary pgd. This wires in the remaining stuff so that there's a new interface with which you can map a region into an arbitrary PGD. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/mm/pageattr.c | 53 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db8ace29514f..b3b19f46c016 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -453,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Check for races, another CPU might have split this page * up already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) goto out_unlock; @@ -559,7 +559,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, } static int -__split_large_page(pte_t *kpte, unsigned long address, struct page *base) +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) { pte_t *pbase = (pte_t *)page_address(base); unsigned long pfn, pfninc = 1; @@ -572,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) * Check for races, another CPU might have split this page * up for us already: */ - tmp = lookup_address(address, &level); + tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; @@ -648,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base) return 0; } -static int split_large_page(pte_t *kpte, unsigned long address) +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) { struct page *base; @@ -660,7 +662,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (!base) return -ENOMEM; - if (__split_large_page(kpte, address, base)) + if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; @@ -1041,6 +1043,9 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { + if (cpa->pgd) + return populate_pgd(cpa, vaddr); + /* * Ignore all non primary paths. */ @@ -1085,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) else address = *cpa->vaddr; repeat: - kpte = lookup_address(address, &level); + kpte = _lookup_address_cpa(cpa, address, &level); if (!kpte) return __cpa_process_fault(cpa, address, primary); @@ -1149,7 +1154,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) /* * We have to split the large page: */ - err = split_large_page(kpte, address); + err = split_large_page(cpa, kpte, address); if (!err) { /* * Do a global flush tlb after splitting the large page @@ -1298,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, int ret, cache, checkalias; unsigned long baddr = 0; + memset(&cpa, 0, sizeof(cpa)); + /* * Check, if we are requested to change a not supported * feature: @@ -1744,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), @@ -1762,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), @@ -1822,6 +1831,36 @@ bool kernel_page_present(struct page *page) #endif /* CONFIG_DEBUG_PAGEALLOC */ +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) +{ + int retval = -EINVAL; + + struct cpa_data cpa = { + .vaddr = &address, + .pfn = pfn, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(0), + .flags = 0, + }; + + if (!(__supported_pte_mask & _PAGE_NX)) + goto out; + + if (!(page_flags & _PAGE_NX)) + cpa.mask_clr = __pgprot(_PAGE_NX); + + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + + retval = __change_page_attr_set_clr(&cpa, 0); + __flush_tlb_all(); + +out: + return retval; +} + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. From d2f7cbe7b26a74dbbbf8f325b2a6fd01bc34032c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:08 +0100 Subject: [PATCH 10/24] x86/efi: Runtime services virtual mapping We map the EFI regions needed for runtime services non-contiguously, with preserved alignment on virtual addresses starting from -4G down for a total max space of 64G. This way, we provide for stable runtime services addresses across kernels so that a kexec'd kernel can still use them. Thus, they're mapped in a separate pagetable so that we don't pollute the kernel namespace. Add an efi= kernel command line parameter for passing miscellaneous options and chicken bits from the command line. While at it, add a chicken bit called "efi=old_map" which can be used as a fallback to the old runtime services mapping method in case there's some b0rkage with a particular EFI implementation (haha, it is hard to hold up the sarcasm here...). Also, add the UEFI RT VA space to Documentation/x86/x86_64/mm.txt. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 6 ++ Documentation/x86/x86_64/mm.txt | 7 ++ arch/x86/include/asm/efi.h | 64 +++++++++++----- arch/x86/include/asm/pgtable_types.h | 3 +- arch/x86/platform/efi/efi.c | 94 ++++++++++++++++------- arch/x86/platform/efi/efi_32.c | 9 ++- arch/x86/platform/efi/efi_64.c | 109 +++++++++++++++++++++++++++ arch/x86/platform/efi/efi_stub_64.S | 54 +++++++++++++ include/linux/efi.h | 1 + 9 files changed, 300 insertions(+), 47 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7a0f202d482e..ed43e92b0e7e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -835,6 +835,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} + efi= [EFI] + Format: { "old_map" } + old_map [X86-64]: switch to the old ioremap-based EFI + runtime services mapping. 32-bit still uses this one by + default. + efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of your efi variable storage. Use this parameter only if diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 881582f75c9c..c584a51add15 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -28,4 +28,11 @@ reference. Current X86-64 implementations only support 40 bits of address space, but we support up to 46 bits. This expands into MBZ space in the page tables. +->trampoline_pgd: + +We map EFI runtime services in the aforementioned PGD in the virtual +range of 64Gb (arbitrarily set, can be raised if needed) + +0xffffffef00000000 - 0xffffffff00000000 + -Andi Kleen, Jul 2004 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 65c6e6e3a552..89a05b0507b9 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,24 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +/* + * We map the EFI regions needed for runtime services non-contiguously, + * with preserved alignment on virtual addresses starting from -4G down + * for a total max space of 64G. This way, we provide for stable runtime + * services addresses across kernels so that a kexec'd kernel can still + * use them. + * + * This is the main reason why we're doing stable VA mappings for RT + * services. + * + * This flag is used in conjuction with a chicken bit called + * "efi=old_map" which can be used as a fallback to the old runtime + * services mapping method in case there's some b0rkage with a + * particular EFI implementation (haha, it is hard to hold up the + * sarcasm here...). + */ +#define EFI_OLD_MEMMAP EFI_ARCH_1 + #ifdef CONFIG_X86_32 #define EFI_LOADER_SIGNATURE "EL32" @@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ (u64)(a4), (u64)(a5), (u64)(a6)) +#define _efi_call_virtX(x, f, ...) \ +({ \ + efi_status_t __s; \ + \ + efi_sync_low_kernel_mappings(); \ + preempt_disable(); \ + __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + preempt_enable(); \ + __s; \ +}) + #define efi_call_virt0(f) \ - efi_call0((efi.systab->runtime->f)) -#define efi_call_virt1(f, a1) \ - efi_call1((efi.systab->runtime->f), (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ - (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) + _efi_call_virtX(0, f) +#define efi_call_virt1(f, a1) \ + _efi_call_virtX(1, f, (u64)(a1)) +#define efi_call_virt2(f, a1, a2) \ + _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) +#define efi_call_virt3(f, a1, a2, a3) \ + _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) +#define efi_call_virt4(f, a1, a2, a3, a4) \ + _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) +#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ + _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) +#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ + _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, extern int add_efi_memmap; extern unsigned long x86_efi_facility; +extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_map_region(efi_memory_desc_t *md); +extern void efi_sync_low_kernel_mappings(void); +extern void efi_setup_page_tables(void); +extern void __init old_map_region(efi_memory_desc_t *md); #ifdef CONFIG_EFI diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..028e28b6fc2c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -379,7 +379,8 @@ static inline void update_page_count(int level, unsigned long pages) { } */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern phys_addr_t slow_virt_to_phys(void *__address); - +extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f396163b0402..b453069236fd 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -12,6 +12,8 @@ * Bibo Mao * Chandramouli Narayanan * Huang Ying + * Copyright (C) 2013 SuSE Labs + * Borislav Petkov - runtime services VA mapping * * Copied from efi_32.c to eliminate the duplicated code between EFI * 32/64 support code. --ying 2007-10-26 @@ -745,21 +747,56 @@ void efi_memory_uc(u64 addr, unsigned long size) set_memory_uc(addr, npages); } +void __init old_map_region(efi_memory_desc_t *md) +{ + u64 start_pfn, end_pfn, end; + unsigned long size; + void *va; + + start_pfn = PFN_DOWN(md->phys_addr); + size = md->num_pages << PAGE_SHIFT; + end = md->phys_addr + size; + end_pfn = PFN_UP(end); + + if (pfn_range_is_mapped(start_pfn, end_pfn)) { + va = __va(md->phys_addr); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); + + md->virt_addr = (u64) (unsigned long) va; + if (!va) + pr_err("ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); +} + /* * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to + * Essentially, we look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor into the + * ->trampoline_pgd page table using a top-down VA allocation scheme. + * + * The old method which used to update that memory descriptor with the + * virtual address obtained from ioremap() is still supported when the + * kernel is booted with efi=old_map on its command line. Same old + * method enabled the runtime services to be called without having to * thunk back into physical mode for every invocation. + * + * The new method does a pagetable switch in a preemption-safe manner + * so that we're in a different address space when calling a runtime + * function. For function arguments passing we do copy the PGDs of the + * kernel page table into ->trampoline_pgd prior to each call. */ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md, *prev_md = NULL; - efi_status_t status; + void *p, *new_memmap = NULL; unsigned long size; - u64 end, systab, start_pfn, end_pfn; - void *p, *va, *new_memmap = NULL; + efi_status_t status; + u64 end, systab; int count = 0; efi.systab = NULL; @@ -768,7 +805,6 @@ void __init efi_enter_virtual_mode(void) * We don't do virtual mode, since we don't do runtime services, on * non-native EFI */ - if (!efi_is_native()) { efi_unmap_memmap(); return; @@ -799,6 +835,7 @@ void __init efi_enter_virtual_mode(void) continue; } prev_md = md; + } for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -808,33 +845,18 @@ void __init efi_enter_virtual_mode(void) md->type != EFI_BOOT_SERVICES_DATA) continue; + efi_map_region(md); + size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - start_pfn = PFN_DOWN(md->phys_addr); - end_pfn = PFN_UP(end); - if (pfn_range_is_mapped(start_pfn, end_pfn)) { - va = __va(md->phys_addr); - - if (!(md->attribute & EFI_MEMORY_WB)) - efi_memory_uc((u64)(unsigned long)va, size); - } else - va = efi_ioremap(md->phys_addr, size, - md->type, md->attribute); - - md->virt_addr = (u64) (unsigned long) va; - - if (!va) { - pr_err("ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; } + new_memmap = krealloc(new_memmap, (count + 1) * memmap.desc_size, GFP_KERNEL); @@ -845,6 +867,9 @@ void __init efi_enter_virtual_mode(void) BUG_ON(!efi.systab); + efi_setup_page_tables(); + efi_sync_low_kernel_mappings(); + status = phys_efi_set_virtual_address_map( memmap.desc_size * count, memmap.desc_size, @@ -877,7 +902,8 @@ void __init efi_enter_virtual_mode(void) efi.query_variable_info = virt_efi_query_variable_info; efi.update_capsule = virt_efi_update_capsule; efi.query_capsule_caps = virt_efi_query_capsule_caps; - if (__supported_pte_mask & _PAGE_NX) + + if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); kfree(new_memmap); @@ -1007,3 +1033,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) return EFI_SUCCESS; } EXPORT_SYMBOL_GPL(efi_query_variable_store); + +static int __init parse_efi_cmdline(char *str) +{ + if (*str == '=') + str++; + + if (!strncmp(str, "old_map", 7)) + set_bit(EFI_OLD_MEMMAP, &x86_efi_facility); + + return 0; +} +early_param("efi", parse_efi_cmdline); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 40e446941dd7..e94557cf5487 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -37,9 +37,16 @@ * claim EFI runtime service handler exclusively and to duplicate a memory in * low memory space say 0 - 3G. */ - static unsigned long efi_rt_eflags; +void efi_sync_low_kernel_mappings(void) {} +void efi_setup_page_tables(void) {} + +void __init efi_map_region(efi_memory_desc_t *md) +{ + old_map_region(md); +} + void efi_call_phys_prelog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 39a0e7f1f0a3..bf286c386d33 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -38,10 +38,28 @@ #include #include #include +#include static pgd_t *save_pgd __initdata; static unsigned long efi_flags __initdata; +/* + * We allocate runtime services regions bottom-up, starting from -4G, i.e. + * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. + */ +static u64 efi_va = -4 * (1UL << 30); +#define EFI_VA_END (-68 * (1UL << 30)) + +/* + * Scratch space used for switching the pagetable in the EFI stub + */ +struct efi_scratch { + u64 r15; + u64 prev_cr3; + pgd_t *efi_pgt; + bool use_pgd; +}; + static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; @@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void) int pgd; int n_pgds; + if (!efi_enabled(EFI_OLD_MEMMAP)) + return; + early_code_mapping_set_exec(1); local_irq_save(efi_flags); @@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void) */ int pgd; int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + + if (!efi_enabled(EFI_OLD_MEMMAP)) + return; + for (pgd = 0; pgd < n_pgds; pgd++) set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); kfree(save_pgd); @@ -94,6 +119,90 @@ void __init efi_call_phys_epilog(void) early_code_mapping_set_exec(0); } +/* + * Add low kernel mappings for passing arguments to EFI functions. + */ +void efi_sync_low_kernel_mappings(void) +{ + unsigned num_pgds; + pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + + if (efi_enabled(EFI_OLD_MEMMAP)) + return; + + num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET); + + memcpy(pgd + pgd_index(PAGE_OFFSET), + init_mm.pgd + pgd_index(PAGE_OFFSET), + sizeof(pgd_t) * num_pgds); +} + +void efi_setup_page_tables(void) +{ + efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd; + + if (!efi_enabled(EFI_OLD_MEMMAP)) + efi_scratch.use_pgd = true; +} + +static void __init __map_region(efi_memory_desc_t *md, u64 va) +{ + pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); + unsigned long pf = 0, size; + u64 end; + + if (!(md->attribute & EFI_MEMORY_WB)) + pf |= _PAGE_PCD; + + size = md->num_pages << PAGE_SHIFT; + end = va + size; + + if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) + pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", + md->phys_addr, va); +} + +void __init efi_map_region(efi_memory_desc_t *md) +{ + unsigned long size = md->num_pages << PAGE_SHIFT; + u64 pa = md->phys_addr; + + if (efi_enabled(EFI_OLD_MEMMAP)) + return old_map_region(md); + + /* + * Make sure the 1:1 mappings are present as a catch-all for b0rked + * firmware which doesn't update all internal pointers after switching + * to virtual mode and would otherwise crap on us. + */ + __map_region(md, md->phys_addr); + + efi_va -= size; + + /* Is PA 2M-aligned? */ + if (!(pa & (PMD_SIZE - 1))) { + efi_va &= PMD_MASK; + } else { + u64 pa_offset = pa & (PMD_SIZE - 1); + u64 prev_va = efi_va; + + /* get us the same offset within this 2M page */ + efi_va = (efi_va & PMD_MASK) + pa_offset; + + if (efi_va > prev_va) + efi_va -= PMD_SIZE; + } + + if (efi_va < EFI_VA_END) { + pr_warn(FW_WARN "VA address range overflow!\n"); + return; + } + + /* Do the VA map */ + __map_region(md, efi_va); + md->virt_addr = efi_va; +} + void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, u32 type, u64 attribute) { diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 4c07ccab8146..88073b140298 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -34,10 +34,47 @@ mov %rsi, %cr0; \ mov (%rsp), %rsp + /* stolen from gcc */ + .macro FLUSH_TLB_ALL + movq %r15, efi_scratch(%rip) + movq %r14, efi_scratch+8(%rip) + movq %cr4, %r15 + movq %r15, %r14 + andb $0x7f, %r14b + movq %r14, %cr4 + movq %r15, %cr4 + movq efi_scratch+8(%rip), %r14 + movq efi_scratch(%rip), %r15 + .endm + + .macro SWITCH_PGT + cmpb $0, efi_scratch+24(%rip) + je 1f + movq %r15, efi_scratch(%rip) # r15 + # save previous CR3 + movq %cr3, %r15 + movq %r15, efi_scratch+8(%rip) # prev_cr3 + movq efi_scratch+16(%rip), %r15 # EFI pgt + movq %r15, %cr3 + 1: + .endm + + .macro RESTORE_PGT + cmpb $0, efi_scratch+24(%rip) + je 2f + movq efi_scratch+8(%rip), %r15 + movq %r15, %cr3 + movq efi_scratch(%rip), %r15 + FLUSH_TLB_ALL + 2: + .endm + ENTRY(efi_call0) SAVE_XMM subq $32, %rsp + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -47,7 +84,9 @@ ENTRY(efi_call1) SAVE_XMM subq $32, %rsp mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -57,7 +96,9 @@ ENTRY(efi_call2) SAVE_XMM subq $32, %rsp mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -68,7 +109,9 @@ ENTRY(efi_call3) subq $32, %rsp mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -80,7 +123,9 @@ ENTRY(efi_call4) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $32, %rsp RESTORE_XMM ret @@ -93,7 +138,9 @@ ENTRY(efi_call5) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $48, %rsp RESTORE_XMM ret @@ -109,8 +156,15 @@ ENTRY(efi_call6) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx + SWITCH_PGT call *%rdi + RESTORE_PGT addq $48, %rsp RESTORE_XMM ret ENDPROC(efi_call6) + + .data +ENTRY(efi_scratch) + .fill 3,8,0 + .byte 0 diff --git a/include/linux/efi.h b/include/linux/efi.h index bc5687d0f315..6c0ca528300c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -653,6 +653,7 @@ extern int __init efi_setup_pcdp_console(char *); #define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */ #define EFI_MEMMAP 4 /* Can we use EFI memory map? */ #define EFI_64BIT 5 /* Is the firmware 64-bit? */ +#define EFI_ARCH_1 6 /* First arch-specific bit */ #ifdef CONFIG_EFI # ifdef CONFIG_X86 From ee41143027706d9f342dfe05487a00b20887fde7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 31 Oct 2013 17:25:09 +0100 Subject: [PATCH 11/24] x86/efi: Check krealloc return value Check it just in case. We might just as well panic there because runtime won't be functioning anyway. Signed-off-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index b453069236fd..3fac4dee492f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -860,6 +860,9 @@ void __init efi_enter_virtual_mode(void) new_memmap = krealloc(new_memmap, (count + 1) * memmap.desc_size, GFP_KERNEL); + if (!new_memmap) + goto err_out; + memcpy(new_memmap + (count * memmap.desc_size), md, memmap.desc_size); count++; @@ -914,6 +917,11 @@ void __init efi_enter_virtual_mode(void) EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); + + return; + + err_out: + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); } /* From 2da6e57cce14a1c3b0692d6f877b72e185110e2e Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:13 +0800 Subject: [PATCH 12/24] x86/efi: Remove unused variables in __map_region() variables size and end is useless in this function, thus remove them. Reported-by: Toshi Kani Tested-by: Toshi Kani Signed-off-by: Dave Young Acked-by: Borislav Petkov Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi_64.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index bf286c386d33..c5a6491d95da 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -148,15 +148,11 @@ void efi_setup_page_tables(void) static void __init __map_region(efi_memory_desc_t *md, u64 va) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); - unsigned long pf = 0, size; - u64 end; + unsigned long pf = 0; if (!(md->attribute & EFI_MEMORY_WB)) pf |= _PAGE_PCD; - size = md->num_pages << PAGE_SHIFT; - end = va + size; - if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", md->phys_addr, va); From 3b2664964bc886ae9d5127c8d3708b1acc0626d2 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:14 +0800 Subject: [PATCH 13/24] x86/efi: Add a wrapper function efi_map_region_fixed() Kexec kernel will use saved runtime virtual mapping, so add a new function efi_map_region_fixed() for directly mapping a md to md->virt. The md is passed in from 1st kernel, the virtual addr is saved in md->virt_addr. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi_32.c | 2 ++ arch/x86/platform/efi/efi_64.c | 10 ++++++++++ 3 files changed, 13 insertions(+) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 89a05b0507b9..9fbaeb239bde 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -128,6 +128,7 @@ extern void efi_call_phys_epilog(void); extern void efi_unmap_memmap(void); extern void efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); +extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index e94557cf5487..7b3ec6ed99af 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -47,6 +47,8 @@ void __init efi_map_region(efi_memory_desc_t *md) old_map_region(md); } +void __init efi_map_region_fixed(efi_memory_desc_t *md) {} + void efi_call_phys_prelog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c5a6491d95da..ff08cb19630b 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -199,6 +199,16 @@ void __init efi_map_region(efi_memory_desc_t *md) md->virt_addr = efi_va; } +/* + * kexec kernel will use efi_map_region_fixed to map efi runtime memory ranges. + * md->virt_addr is the original virtual address which had been mapped in kexec + * 1st kernel. + */ +void __init efi_map_region_fixed(efi_memory_desc_t *md) +{ + __map_region(md, md->virt_addr); +} + void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, u32 type, u64 attribute) { From a7f84f03f660d93574ac88835d056c0d6468aebe Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:15 +0800 Subject: [PATCH 14/24] x86/efi: Fix off-by-one bug in EFI Boot Services reservation Current code check boot service region with kernel text region by: start+size >= __pa_symbol(_text) The end of the above region should be start + size - 1 instead. I see this problem in ovmf + Fedora 19 grub boot: text start: 1000000 md start: 800000 md size: 800000 Signed-off-by: Dave Young Acked-by: Borislav Petkov Acked-by: Toshi Kani Tested-by: Toshi Kani Cc: stable@vger.kernel.org Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f8ec4dafc74e..15e3b5ea31b5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -438,7 +438,7 @@ void __init efi_reserve_boot_services(void) * - Not within any part of the kernel * - Not the bios reserved area */ - if ((start+size >= __pa_symbol(_text) + if ((start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || memblock_is_region_reserved(start, size)) { From 481f75c043cf44ec11c7fbdbbf37d43463f1e719 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:16 +0800 Subject: [PATCH 15/24] x86/efi: Cleanup efi_enter_virtual_mode() function Add two small functions: efi_merge_regions() and efi_map_regions(), efi_enter_virtual_mode() calls them instead of embedding two long for loop. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 140 +++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 15e3b5ea31b5..4694632ef581 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -773,44 +773,12 @@ void __init old_map_region(efi_memory_desc_t *md) (unsigned long long)md->phys_addr); } -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, we look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor into the - * ->trampoline_pgd page table using a top-down VA allocation scheme. - * - * The old method which used to update that memory descriptor with the - * virtual address obtained from ioremap() is still supported when the - * kernel is booted with efi=old_map on its command line. Same old - * method enabled the runtime services to be called without having to - * thunk back into physical mode for every invocation. - * - * The new method does a pagetable switch in a preemption-safe manner - * so that we're in a different address space when calling a runtime - * function. For function arguments passing we do copy the PGDs of the - * kernel page table into ->trampoline_pgd prior to each call. - */ -void __init efi_enter_virtual_mode(void) +/* Merge contiguous regions of the same type and attribute */ +static void __init efi_merge_regions(void) { + void *p; efi_memory_desc_t *md, *prev_md = NULL; - void *p, *new_memmap = NULL; - unsigned long size; - efi_status_t status; - u64 end, systab; - int count = 0; - efi.systab = NULL; - - /* - * We don't do virtual mode, since we don't do runtime services, on - * non-native EFI - */ - if (!efi_is_native()) { - efi_unmap_memmap(); - return; - } - - /* Merge contiguous regions of the same type and attribute */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { u64 prev_size; md = p; @@ -835,8 +803,31 @@ void __init efi_enter_virtual_mode(void) continue; } prev_md = md; - } +} + +static void __init get_systab_virt_addr(efi_memory_desc_t *md) +{ + unsigned long size; + u64 end, systab; + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + systab = (u64)(unsigned long)efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *)(unsigned long)systab; + } +} + +/* + * Map efi memory ranges for runtime serivce and update new_memmap with virtual + * addresses. + */ +static void * __init efi_map_regions(int *count) +{ + efi_memory_desc_t *md; + void *p, *tmp, *new_memmap = NULL; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; @@ -849,26 +840,64 @@ void __init efi_enter_virtual_mode(void) } efi_map_region(md); + get_systab_virt_addr(md); - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; - - systab = (u64) (unsigned long) efi_phys.systab; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; - - efi.systab = (efi_system_table_t *) (unsigned long) systab; - } - - new_memmap = krealloc(new_memmap, - (count + 1) * memmap.desc_size, - GFP_KERNEL); - if (!new_memmap) - goto err_out; - - memcpy(new_memmap + (count * memmap.desc_size), md, + tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size, + GFP_KERNEL); + if (!tmp) + goto out_krealloc; + new_memmap = tmp; + memcpy(new_memmap + (*count * memmap.desc_size), md, memmap.desc_size); - count++; + (*count)++; + } + + return new_memmap; +out_krealloc: + kfree(new_memmap); + return NULL; +} + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, we look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor into the + * ->trampoline_pgd page table using a top-down VA allocation scheme. + * + * The old method which used to update that memory descriptor with the + * virtual address obtained from ioremap() is still supported when the + * kernel is booted with efi=old_map on its command line. Same old + * method enabled the runtime services to be called without having to + * thunk back into physical mode for every invocation. + * + * The new method does a pagetable switch in a preemption-safe manner + * so that we're in a different address space when calling a runtime + * function. For function arguments passing we do copy the PGDs of the + * kernel page table into ->trampoline_pgd prior to each call. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_status_t status; + void *new_memmap = NULL; + int count = 0; + + efi.systab = NULL; + + /* + * We don't do virtual mode, since we don't do runtime services, on + * non-native EFI + */ + if (!efi_is_native()) { + efi_unmap_memmap(); + return; + } + + efi_merge_regions(); + + new_memmap = efi_map_regions(&count); + if (!new_memmap) { + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + return; } BUG_ON(!efi.systab); @@ -922,9 +951,6 @@ void __init efi_enter_virtual_mode(void) 0, NULL); return; - - err_out: - pr_err("Error reallocating memory, EFI runtime non-functional!\n"); } /* From a0998eb15afeffbf52a2c2829318f67df9ac57b8 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:17 +0800 Subject: [PATCH 16/24] efi: Export more EFI table variables to sysfs Export fw_vendor, runtime and config table physical addresses to /sys/firmware/efi/{fw_vendor,runtime,config_table} because kexec kernels need them. From EFI spec these 3 variables will be updated to virtual address after entering virtual mode. But kernel startup code will need the physical address. Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- Documentation/ABI/testing/sysfs-firmware-efi | 20 ++++++++++ arch/x86/platform/efi/efi.c | 4 ++ drivers/firmware/efi/efi.c | 41 +++++++++++++++++++- include/linux/efi.h | 3 ++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-efi diff --git a/Documentation/ABI/testing/sysfs-firmware-efi b/Documentation/ABI/testing/sysfs-firmware-efi new file mode 100644 index 000000000000..05874da7ce80 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-efi @@ -0,0 +1,20 @@ +What: /sys/firmware/efi/fw_vendor +Date: December 2013 +Contact: Dave Young +Description: It shows the physical address of firmware vendor field in the + EFI system table. +Users: Kexec + +What: /sys/firmware/efi/runtime +Date: December 2013 +Contact: Dave Young +Description: It shows the physical address of runtime service table entry in + the EFI system table. +Users: Kexec + +What: /sys/firmware/efi/config_table +Date: December 2013 +Contact: Dave Young +Description: It shows the physical address of config table entry in the EFI + system table. +Users: Kexec diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 4694632ef581..28591072fbb7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -653,6 +653,10 @@ void __init efi_init(void) set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility); + efi.config_table = (unsigned long)efi.systab->tables; + efi.fw_vendor = (unsigned long)efi.systab->fw_vendor; + efi.runtime = (unsigned long)efi.systab->runtime; + /* * Show what we know for posterity */ diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 2e2fbdec0845..72533af72b98 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -32,6 +32,9 @@ struct efi __read_mostly efi = { .hcdp = EFI_INVALID_TABLE_ADDR, .uga = EFI_INVALID_TABLE_ADDR, .uv_systab = EFI_INVALID_TABLE_ADDR, + .fw_vendor = EFI_INVALID_TABLE_ADDR, + .runtime = EFI_INVALID_TABLE_ADDR, + .config_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -71,13 +74,49 @@ static ssize_t systab_show(struct kobject *kobj, static struct kobj_attribute efi_attr_systab = __ATTR(systab, 0400, systab_show, NULL); +#define EFI_FIELD(var) efi.var + +#define EFI_ATTR_SHOW(name) \ +static ssize_t name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sprintf(buf, "0x%lx\n", EFI_FIELD(name)); \ +} + +EFI_ATTR_SHOW(fw_vendor); +EFI_ATTR_SHOW(runtime); +EFI_ATTR_SHOW(config_table); + +static struct kobj_attribute efi_attr_fw_vendor = __ATTR_RO(fw_vendor); +static struct kobj_attribute efi_attr_runtime = __ATTR_RO(runtime); +static struct kobj_attribute efi_attr_config_table = __ATTR_RO(config_table); + static struct attribute *efi_subsys_attrs[] = { &efi_attr_systab.attr, - NULL, /* maybe more in the future? */ + &efi_attr_fw_vendor.attr, + &efi_attr_runtime.attr, + &efi_attr_config_table.attr, + NULL, }; +static umode_t efi_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + umode_t mode = attr->mode; + + if (attr == &efi_attr_fw_vendor.attr) + return (efi.fw_vendor == EFI_INVALID_TABLE_ADDR) ? 0 : mode; + else if (attr == &efi_attr_runtime.attr) + return (efi.runtime == EFI_INVALID_TABLE_ADDR) ? 0 : mode; + else if (attr == &efi_attr_config_table.attr) + return (efi.config_table == EFI_INVALID_TABLE_ADDR) ? 0 : mode; + + return mode; +} + static struct attribute_group efi_subsys_attr_group = { .attrs = efi_subsys_attrs, + .is_visible = efi_attr_is_visible, }; static struct efivars generic_efivars; diff --git a/include/linux/efi.h b/include/linux/efi.h index 6c0ca528300c..fb60b10b7bd9 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -556,6 +556,9 @@ extern struct efi { unsigned long hcdp; /* HCDP table */ unsigned long uga; /* UGA table */ unsigned long uv_systab; /* UV system table */ + unsigned long fw_vendor; /* fw_vendor */ + unsigned long runtime; /* runtime table */ + unsigned long config_table; /* config tables */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; From 926172d46038d7610b6b8d84e40db727cefb482d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:18 +0800 Subject: [PATCH 17/24] efi: Export EFI runtime memory mapping to sysfs kexec kernel will need exactly same mapping for EFI runtime memory ranges. Thus here export the runtime ranges mapping to sysfs, kexec-tools will assemble them and pass to 2nd kernel via setup_data. Introducing a new directory /sys/firmware/efi/runtime-map just like /sys/firmware/memmap. Containing below attribute in each file of that directory: attribute num_pages phys_addr type virt_addr Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- .../testing/sysfs-firmware-efi-runtime-map | 34 ++++ arch/x86/platform/efi/efi.c | 46 ++++- drivers/firmware/efi/Kconfig | 11 ++ drivers/firmware/efi/Makefile | 1 + drivers/firmware/efi/efi.c | 4 + drivers/firmware/efi/runtime-map.c | 181 ++++++++++++++++++ include/linux/efi.h | 13 ++ 7 files changed, 287 insertions(+), 3 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-efi-runtime-map create mode 100644 drivers/firmware/efi/runtime-map.c diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map b/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map new file mode 100644 index 000000000000..c61b9b348e99 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-efi-runtime-map @@ -0,0 +1,34 @@ +What: /sys/firmware/efi/runtime-map/ +Date: December 2013 +Contact: Dave Young +Description: Switching efi runtime services to virtual mode requires + that all efi memory ranges which have the runtime attribute + bit set to be mapped to virtual addresses. + + The efi runtime services can only be switched to virtual + mode once without rebooting. The kexec kernel must maintain + the same physical to virtual address mappings as the first + kernel. The mappings are exported to sysfs so userspace tools + can reassemble them and pass them into the kexec kernel. + + /sys/firmware/efi/runtime-map/ is the directory the kernel + exports that information in. + + subdirectories are named with the number of the memory range: + + /sys/firmware/efi/runtime-map/0 + /sys/firmware/efi/runtime-map/1 + /sys/firmware/efi/runtime-map/2 + /sys/firmware/efi/runtime-map/3 + ... + + Each subdirectory contains five files: + + attribute : The attributes of the memory range. + num_pages : The size of the memory range in pages. + phys_addr : The physical address of the memory range. + type : The type of the memory range. + virt_addr : The virtual address of the memory range. + + Above values are all hexadecimal numbers with the '0x' prefix. +Users: Kexec diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 28591072fbb7..74fe7a719508 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -76,6 +76,9 @@ static __initdata efi_config_table_type_t arch_tables[] = { {NULL_GUID, NULL, NULL}, }; +static void *efi_runtime_map; +static int nr_efi_runtime_map; + /* * Returns 1 if 'facility' is enabled, 0 otherwise. */ @@ -824,6 +827,39 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) } } +static int __init save_runtime_map(void) +{ + efi_memory_desc_t *md; + void *tmp, *p, *q = NULL; + int count = 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if (!(md->attribute & EFI_MEMORY_RUNTIME) || + (md->type == EFI_BOOT_SERVICES_CODE) || + (md->type == EFI_BOOT_SERVICES_DATA)) + continue; + tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + if (!tmp) + goto out; + q = tmp; + + memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + count++; + } + + efi_runtime_map = q; + nr_efi_runtime_map = count; + efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map, + boot_params.efi_info.efi_memdesc_size); + + return 0; +out: + kfree(q); + return -ENOMEM; +} + /* * Map efi memory ranges for runtime serivce and update new_memmap with virtual * addresses. @@ -849,7 +885,7 @@ static void * __init efi_map_regions(int *count) tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size, GFP_KERNEL); if (!tmp) - goto out_krealloc; + goto out; new_memmap = tmp; memcpy(new_memmap + (*count * memmap.desc_size), md, memmap.desc_size); @@ -857,7 +893,7 @@ static void * __init efi_map_regions(int *count) } return new_memmap; -out_krealloc: +out: kfree(new_memmap); return NULL; } @@ -883,7 +919,7 @@ void __init efi_enter_virtual_mode(void) { efi_status_t status; void *new_memmap = NULL; - int count = 0; + int err, count = 0; efi.systab = NULL; @@ -904,6 +940,10 @@ void __init efi_enter_virtual_mode(void) return; } + err = save_runtime_map(); + if (err) + pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n"); + BUG_ON(!efi.systab); efi_setup_page_tables(); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 3150aa4874e8..730f5f2e8b7f 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -39,4 +39,15 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE config UEFI_CPER def_bool n +config EFI_RUNTIME_MAP + bool "Export efi runtime maps to sysfs" + depends on X86 && EFI && KEXEC + default y + help + Export efi runtime memory maps to /sys/firmware/efi/runtime-map. + That memory map is used for example by kexec to set up efi virtual + mapping the 2nd kernel, but can also be used for debugging purposes. + + See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map. + endmenu diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 9ba156d3c775..a58e0f183a08 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -5,3 +5,4 @@ obj-y += efi.o vars.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o obj-$(CONFIG_UEFI_CPER) += cper.o +obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 72533af72b98..4753bac65279 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -167,6 +167,10 @@ static int __init efisubsys_init(void) goto err_unregister; } + error = efi_runtime_map_init(efi_kobj); + if (error) + goto err_remove_group; + /* and the standard mountpoint for efivarfs */ efivars_kobj = kobject_create_and_add("efivars", efi_kobj); if (!efivars_kobj) { diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c new file mode 100644 index 000000000000..97cdd16a2169 --- /dev/null +++ b/drivers/firmware/efi/runtime-map.c @@ -0,0 +1,181 @@ +/* + * linux/drivers/efi/runtime-map.c + * Copyright (C) 2013 Red Hat, Inc., Dave Young + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +#include + +static void *efi_runtime_map; +static int nr_efi_runtime_map; +static u32 efi_memdesc_size; + +struct efi_runtime_map_entry { + efi_memory_desc_t md; + struct kobject kobj; /* kobject for each entry */ +}; + +static struct efi_runtime_map_entry **map_entries; + +struct map_attribute { + struct attribute attr; + ssize_t (*show)(struct efi_runtime_map_entry *entry, char *buf); +}; + +static inline struct map_attribute *to_map_attr(struct attribute *attr) +{ + return container_of(attr, struct map_attribute, attr); +} + +static ssize_t type_show(struct efi_runtime_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%x\n", entry->md.type); +} + +#define EFI_RUNTIME_FIELD(var) entry->md.var + +#define EFI_RUNTIME_U64_ATTR_SHOW(name) \ +static ssize_t name##_show(struct efi_runtime_map_entry *entry, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", EFI_RUNTIME_FIELD(name)); \ +} + +EFI_RUNTIME_U64_ATTR_SHOW(phys_addr); +EFI_RUNTIME_U64_ATTR_SHOW(virt_addr); +EFI_RUNTIME_U64_ATTR_SHOW(num_pages); +EFI_RUNTIME_U64_ATTR_SHOW(attribute); + +static inline struct efi_runtime_map_entry *to_map_entry(struct kobject *kobj) +{ + return container_of(kobj, struct efi_runtime_map_entry, kobj); +} + +static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct efi_runtime_map_entry *entry = to_map_entry(kobj); + struct map_attribute *map_attr = to_map_attr(attr); + + return map_attr->show(entry, buf); +} + +static struct map_attribute map_type_attr = __ATTR_RO(type); +static struct map_attribute map_phys_addr_attr = __ATTR_RO(phys_addr); +static struct map_attribute map_virt_addr_attr = __ATTR_RO(virt_addr); +static struct map_attribute map_num_pages_attr = __ATTR_RO(num_pages); +static struct map_attribute map_attribute_attr = __ATTR_RO(attribute); + +/* + * These are default attributes that are added for every memmap entry. + */ +static struct attribute *def_attrs[] = { + &map_type_attr.attr, + &map_phys_addr_attr.attr, + &map_virt_addr_attr.attr, + &map_num_pages_attr.attr, + &map_attribute_attr.attr, + NULL +}; + +static const struct sysfs_ops map_attr_ops = { + .show = map_attr_show, +}; + +static void map_release(struct kobject *kobj) +{ + struct efi_runtime_map_entry *entry; + + entry = to_map_entry(kobj); + kfree(entry); +} + +static struct kobj_type __refdata map_ktype = { + .sysfs_ops = &map_attr_ops, + .default_attrs = def_attrs, + .release = map_release, +}; + +static struct kset *map_kset; + +static struct efi_runtime_map_entry * +add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) +{ + int ret; + struct efi_runtime_map_entry *entry; + + if (!map_kset) { + map_kset = kset_create_and_add("runtime-map", NULL, kobj); + if (!map_kset) + return ERR_PTR(-ENOMEM); + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + kset_unregister(map_kset); + return entry; + } + + memcpy(&entry->md, efi_runtime_map + nr * efi_memdesc_size, + sizeof(efi_memory_desc_t)); + + kobject_init(&entry->kobj, &map_ktype); + entry->kobj.kset = map_kset; + ret = kobject_add(&entry->kobj, NULL, "%d", nr); + if (ret) { + kobject_put(&entry->kobj); + kset_unregister(map_kset); + return ERR_PTR(ret); + } + + return entry; +} + +void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) +{ + efi_runtime_map = map; + nr_efi_runtime_map = nr_entries; + efi_memdesc_size = desc_size; +} + +int __init efi_runtime_map_init(struct kobject *efi_kobj) +{ + int i, j, ret = 0; + struct efi_runtime_map_entry *entry; + + if (!efi_runtime_map) + return 0; + + map_entries = kzalloc(nr_efi_runtime_map * sizeof(entry), GFP_KERNEL); + if (!map_entries) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < nr_efi_runtime_map; i++) { + entry = add_sysfs_runtime_map_entry(efi_kobj, i); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto out_add_entry; + } + *(map_entries + i) = entry; + } + + return 0; +out_add_entry: + for (j = i - 1; j > 0; j--) { + entry = *(map_entries + j); + kobject_put(&entry->kobj); + } + if (map_kset) + kset_unregister(map_kset); +out: + return ret; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index fb60b10b7bd9..e64540746c63 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -872,4 +872,17 @@ int efivars_sysfs_init(void); #endif /* CONFIG_EFI_VARS */ +#ifdef CONFIG_EFI_RUNTIME_MAP +int efi_runtime_map_init(struct kobject *); +void efi_runtime_map_setup(void *, int, u32); +#else +static inline int efi_runtime_map_init(struct kobject *kobj) +{ + return 0; +} + +static inline void +efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) {} +#endif + #endif /* _LINUX_EFI_H */ From 1fec0533693cd74f2d1a46edd29449cfee429df0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:19 +0800 Subject: [PATCH 18/24] x86/efi: Pass necessary EFI data for kexec via setup_data Add a new setup_data type SETUP_EFI for kexec use. Passing the saved fw_vendor, runtime, config tables and EFI runtime mappings. When entering virtual mode, directly mapping the EFI runtime regions which we passed in previously. And skip the step to call SetVirtualAddressMap(). Specially for HP z420 workstation we need save the smbios physical address. The kernel boot sequence proceeds in the following order. Step 2 requires efi.smbios to be the physical address. However, I found that on HP z420 EFI system table has a virtual address of SMBIOS in step 1. Hence, we need set it back to the physical address with the smbios in efi_setup_data. (When it is still the physical address, it simply sets the same value.) 1. efi_init() - Set efi.smbios from EFI system table 2. dmi_scan_machine() - Temporary map efi.smbios to access SMBIOS table 3. efi_enter_virtual_mode() - Map EFI ranges Tested on ovmf+qemu, lenovo thinkpad, a dell laptop and an HP z420 workstation. Signed-off-by: Dave Young Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 12 ++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/setup.c | 3 + arch/x86/platform/efi/efi.c | 156 ++++++++++++++++++++++---- arch/x86/platform/efi/efi_32.c | 1 + arch/x86/platform/efi/efi_64.c | 6 + 6 files changed, 155 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9fbaeb239bde..4d1ba80b6ff1 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -133,6 +133,18 @@ extern void efi_sync_low_kernel_mappings(void); extern void efi_setup_page_tables(void); extern void __init old_map_region(efi_memory_desc_t *md); +struct efi_setup_data { + u64 fw_vendor; + u64 runtime; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +extern u64 efi_setup; +extern u32 efi_data_len; +extern void parse_efi_setup(u64 phys_addr, u32 data_len); + #ifdef CONFIG_EFI static inline bool efi_is_native(void) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9c3733c5f8f7..64fe421aab65 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -6,6 +6,7 @@ #define SETUP_E820_EXT 1 #define SETUP_DTB 2 #define SETUP_PCI 3 +#define SETUP_EFI 4 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cb233bc9dee3..24536f7a0ae6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -447,6 +447,9 @@ static void __init parse_setup_data(void) case SETUP_DTB: add_dtb(pa_data); break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; default: break; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 74fe7a719508..9965ff403c6e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -78,6 +78,8 @@ static __initdata efi_config_table_type_t arch_tables[] = { static void *efi_runtime_map; static int nr_efi_runtime_map; +u64 efi_setup; /* efi setup_data physical address */ +u32 efi_data_len; /* efi setup_data payload length */ /* * Returns 1 if 'facility' is enabled, 0 otherwise. @@ -115,7 +117,6 @@ static int __init setup_storage_paranoia(char *arg) } early_param("efi_no_storage_paranoia", setup_storage_paranoia); - static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) { unsigned long flags; @@ -494,18 +495,27 @@ static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { efi_system_table_64_t *systab64; + struct efi_setup_data *data = NULL; u64 tmp = 0; + if (efi_setup) { + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) + return -ENOMEM; + } systab64 = early_ioremap((unsigned long)phys, sizeof(*systab64)); if (systab64 == NULL) { pr_err("Couldn't map the system table!\n"); + if (data) + early_iounmap(data, sizeof(*data)); return -ENOMEM; } efi_systab.hdr = systab64->hdr; - efi_systab.fw_vendor = systab64->fw_vendor; - tmp |= systab64->fw_vendor; + efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor : + systab64->fw_vendor; + tmp |= data ? data->fw_vendor : systab64->fw_vendor; efi_systab.fw_revision = systab64->fw_revision; efi_systab.con_in_handle = systab64->con_in_handle; tmp |= systab64->con_in_handle; @@ -519,15 +529,20 @@ static int __init efi_systab_init(void *phys) tmp |= systab64->stderr_handle; efi_systab.stderr = systab64->stderr; tmp |= systab64->stderr; - efi_systab.runtime = (void *)(unsigned long)systab64->runtime; - tmp |= systab64->runtime; + efi_systab.runtime = data ? + (void *)(unsigned long)data->runtime : + (void *)(unsigned long)systab64->runtime; + tmp |= data ? data->runtime : systab64->runtime; efi_systab.boottime = (void *)(unsigned long)systab64->boottime; tmp |= systab64->boottime; efi_systab.nr_tables = systab64->nr_tables; - efi_systab.tables = systab64->tables; - tmp |= systab64->tables; + efi_systab.tables = data ? (unsigned long)data->tables : + systab64->tables; + tmp |= data ? data->tables : systab64->tables; early_iounmap(systab64, sizeof(*systab64)); + if (data) + early_iounmap(data, sizeof(*data)); #ifdef CONFIG_X86_32 if (tmp >> 32) { pr_err("EFI data located above 4GB, disabling EFI.\n"); @@ -631,6 +646,71 @@ static int __init efi_memmap_init(void) return 0; } +/* + * A number of config table entries get remapped to virtual addresses + * after entering EFI virtual mode. However, the kexec kernel requires + * their physical addresses therefore we pass them via setup_data and + * correct those entries to their respective physical addresses here. + * + * Currently only handles smbios which is necessary for some firmware + * implementation. + */ +static int __init efi_reuse_config(u64 tables, int nr_tables) +{ + int i, sz, ret = 0; + void *p, *tablep; + struct efi_setup_data *data; + + if (!efi_setup) + return 0; + + if (!efi_enabled(EFI_64BIT)) + return 0; + + data = early_memremap(efi_setup, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + + if (!data->smbios) + goto out_memremap; + + sz = sizeof(efi_config_table_64_t); + + p = tablep = early_memremap(tables, nr_tables * sz); + if (!p) { + pr_err("Could not map Configuration table!\n"); + ret = -ENOMEM; + goto out_memremap; + } + + for (i = 0; i < efi.systab->nr_tables; i++) { + efi_guid_t guid; + + guid = ((efi_config_table_64_t *)p)->guid; + + if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID)) + ((efi_config_table_64_t *)p)->table = data->smbios; + p += sz; + } + early_iounmap(tablep, nr_tables * sz); + +out_memremap: + early_iounmap(data, sizeof(*data)); +out: + return ret; +} + +static void get_nr_runtime_map(void) +{ + if (!efi_setup) + return; + + nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) / + sizeof(efi_memory_desc_t); +} + void __init efi_init(void) { efi_char16_t *c16; @@ -638,6 +718,7 @@ void __init efi_init(void) int i = 0; void *tmp; + get_nr_runtime_map(); #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi) { @@ -676,6 +757,9 @@ void __init efi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); + if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables)) + return; + if (efi_config_init(arch_tables)) return; @@ -860,6 +944,23 @@ static int __init save_runtime_map(void) return -ENOMEM; } +/* + * Map efi regions which were passed via setup_data. The virt_addr is a fixed + * addr which was used in first kernel of a kexec boot. + */ +static void __init efi_map_regions_fixed(void) +{ + void *p; + efi_memory_desc_t *md; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + efi_map_region_fixed(md); /* FIXME: add error handling */ + get_systab_virt_addr(md); + } + +} + /* * Map efi memory ranges for runtime serivce and update new_memmap with virtual * addresses. @@ -914,6 +1015,10 @@ static void * __init efi_map_regions(int *count) * so that we're in a different address space when calling a runtime * function. For function arguments passing we do copy the PGDs of the * kernel page table into ->trampoline_pgd prior to each call. + * + * Specially for kexec boot, efi runtime maps in previous kernel should + * be passed in via setup_data. In that case runtime ranges will be mapped + * to the same virtual addresses as the first kernel. */ void __init efi_enter_virtual_mode(void) { @@ -932,12 +1037,15 @@ void __init efi_enter_virtual_mode(void) return; } - efi_merge_regions(); - - new_memmap = efi_map_regions(&count); - if (!new_memmap) { - pr_err("Error reallocating memory, EFI runtime non-functional!\n"); - return; + if (efi_setup) { + efi_map_regions_fixed(); + } else { + efi_merge_regions(); + new_memmap = efi_map_regions(&count); + if (!new_memmap) { + pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + return; + } } err = save_runtime_map(); @@ -949,16 +1057,18 @@ void __init efi_enter_virtual_mode(void) efi_setup_page_tables(); efi_sync_low_kernel_mappings(); - status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + if (!efi_setup) { + status = phys_efi_set_virtual_address_map( + memmap.desc_size * count, + memmap.desc_size, + memmap.desc_version, + (efi_memory_desc_t *)__pa(new_memmap)); - if (status != EFI_SUCCESS) { - pr_alert("Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); + if (status != EFI_SUCCESS) { + pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n", + status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } } /* @@ -993,8 +1103,6 @@ void __init efi_enter_virtual_mode(void) EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); - - return; } /* diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 7b3ec6ed99af..249b183cf417 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -48,6 +48,7 @@ void __init efi_map_region(efi_memory_desc_t *md) } void __init efi_map_region_fixed(efi_memory_desc_t *md) {} +void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} void efi_call_phys_prelog(void) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ff08cb19630b..324b65103851 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -228,3 +228,9 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return (void __iomem *)__va(phys_addr); } + +void __init parse_efi_setup(u64 phys_addr, u32 data_len) +{ + efi_setup = phys_addr + sizeof(struct setup_data); + efi_data_len = data_len - sizeof(struct setup_data); +} From 456a29ddada79198c5965300e04103c40c481f62 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:20 +0800 Subject: [PATCH 19/24] x86: Add xloadflags bit for EFI runtime support on kexec Old kexec-tools can not load new kernels. The reason is kexec-tools does not fill efi_info in x86 setup header previously, thus EFI failed to initialize. In new kexec-tools it will by default to fill efi_info and pass other EFI required infomation to 2nd kernel so kexec kernel EFI initialization can succeed finally. To prevent from breaking userspace, add a new xloadflags bit so kexec-tools can check the flag and switch to old logic. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- Documentation/x86/boot.txt | 3 +++ arch/x86/boot/header.S | 9 ++++++++- arch/x86/include/uapi/asm/bootparam.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index f4f268c2b826..cb81741d3b0b 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -608,6 +608,9 @@ Protocol: 2.12+ - If 1, the kernel supports the 64-bit EFI handoff entry point given at handover_offset + 0x200. + Bit 4 (read): XLF_EFI_KEXEC + - If 1, the kernel supports kexec EFI boot with EFI runtime support. + Field name: cmdline_size Type: read Offset/size: 0x238/4 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9ec06a1f6d61..ec3b8ba68096 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -391,7 +391,14 @@ xloadflags: #else # define XLF23 0 #endif - .word XLF0 | XLF1 | XLF23 + +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +# define XLF4 XLF_EFI_KEXEC +#else +# define XLF4 0 +#endif + + .word XLF0 | XLF1 | XLF23 | XLF4 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 64fe421aab65..225b0988043a 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -24,6 +24,7 @@ #define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1) #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) +#define XLF_EFI_KEXEC (1<<4) #ifndef __ASSEMBLY__ From 5039e316dde3fb71c79e95e97c5bca8e4724d8f2 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:21 +0800 Subject: [PATCH 20/24] x86: Export x86 boot_params to sysfs kexec-tools use boot_params for getting the 1st kernel hardware_subarch, the kexec kernel EFI runtime support also needs to read the old efi_info from boot_params. Currently it exists in debugfs which is not a good place for such infomation. Per HPA, we should avoid "sploit debugfs". In this patch /sys/kernel/boot_params are exported, also the setup_data is exported as a subdirectory. kexec-tools is using debugfs for hardware_subarch for a long time now so we're not removing it yet. Structure is like below: /sys/kernel/boot_params |__ data /* boot_params in binary*/ |__ setup_data | |__ 0 /* the first setup_data node */ | | |__ data /* setup_data node 0 in binary*/ | | |__ type /* setup_data type of setup_data node 0, hex string */ [snip] |__ version /* boot protocal version (in hex, "0x" prefixed)*/ Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- .../ABI/testing/sysfs-kernel-boot_params | 38 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ksysfs.c | 339 ++++++++++++++++++ 3 files changed, 378 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-boot_params create mode 100644 arch/x86/kernel/ksysfs.c diff --git a/Documentation/ABI/testing/sysfs-kernel-boot_params b/Documentation/ABI/testing/sysfs-kernel-boot_params new file mode 100644 index 000000000000..eca38ce2852d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-boot_params @@ -0,0 +1,38 @@ +What: /sys/kernel/boot_params +Date: December 2013 +Contact: Dave Young +Description: The /sys/kernel/boot_params directory contains two + files: "data" and "version" and one subdirectory "setup_data". + It is used to export the kernel boot parameters of an x86 + platform to userspace for kexec and debugging purpose. + + If there's no setup_data in boot_params the subdirectory will + not be created. + + "data" file is the binary representation of struct boot_params. + + "version" file is the string representation of boot + protocol version. + + "setup_data" subdirectory contains the setup_data data + structure in boot_params. setup_data is maintained in kernel + as a link list. In "setup_data" subdirectory there's one + subdirectory for each link list node named with the number + of the list nodes. The list node subdirectory contains two + files "type" and "data". "type" file is the string + representation of setup_data type. "data" file is the binary + representation of setup_data payload. + + The whole boot_params directory structure is like below: + /sys/kernel/boot_params + |__ data + |__ setup_data + | |__ 0 + | | |__ data + | | |__ type + | |__ 1 + | |__ data + | |__ type + |__ version + +Users: Kexec diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e2cd79..510cca5c5390 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 000000000000..eb53d153f307 --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,339 @@ +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying + * Copyright (C) 2013, 2013 Red Hat, Inc. + * Dave Young + * + * This file is released under the GPLv2 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + if (nr == i) { + *size = data->len; + iounmap(data); + return 0; + } + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nr, ret; + u64 paddr; + struct setup_data *data; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + ret = sprintf(buf, "0x%x\n", data->type); + iounmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + int nr, ret = 0; + u64 paddr; + struct setup_data *data; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + if (off > data->len) { + ret = -EINVAL; + goto out; + } + + if (count > data->len - off) + count = data->len - off; + + if (!count) + goto out; + + ret = count; + p = ioremap_cache(paddr + sizeof(*data), data->len); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + iounmap(p); +out: + iounmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + iounmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j > 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); From 77ea8c948953a90401e436e7c05973b2d5529804 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 20 Dec 2013 18:02:22 +0800 Subject: [PATCH 21/24] x86: Reserve setup_data ranges late after parsing memmap cmdline Currently e820_reserve_setup_data() is called before parsing early params, it works in normal case. But for memmap=exactmap, the final memory ranges are created after parsing memmap= cmdline params, so the previous e820_reserve_setup_data() has no effect. For example, setup_data ranges will still be marked as normal system ram, thus when later sysfs driver ioremap them kernel will warn about mapping normal ram. This patch fix it by moving the e820_reserve_setup_data() callback after parsing early params so they can be set as reserved ranges and later ioremap will be fine with it. Signed-off-by: Dave Young Acked-by: Borislav Petkov Tested-by: Toshi Kani Signed-off-by: Matt Fleming --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 24536f7a0ae6..be4b456e444b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -927,8 +927,6 @@ void __init setup_arch(char **cmdline_p) iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); - /* update the e820_saved too */ - e820_reserve_setup_data(); copy_edd(); @@ -990,6 +988,8 @@ void __init setup_arch(char **cmdline_p) early_dump_pci_devices(); #endif + /* update the e820_saved too */ + e820_reserve_setup_data(); finish_e820_parsing(); if (efi_enabled(EFI_BOOT)) From 518548abd61808ea1e31614ccbdae34d3c32dfa4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sat, 21 Dec 2013 16:09:46 +0000 Subject: [PATCH 22/24] x86/efi: Delete superfluous global variables There's no need to save the runtime map details in global variables, the values are only required to pass to efi_runtime_map_setup(). And because 'nr_efi_runtime_map' isn't needed, get_nr_runtime_map() can be deleted along with 'efi_data_len'. Cc: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 18 +----------------- arch/x86/platform/efi/efi_64.c | 1 - 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 9965ff403c6e..7ed3ecfde98a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -76,10 +76,7 @@ static __initdata efi_config_table_type_t arch_tables[] = { {NULL_GUID, NULL, NULL}, }; -static void *efi_runtime_map; -static int nr_efi_runtime_map; u64 efi_setup; /* efi setup_data physical address */ -u32 efi_data_len; /* efi setup_data payload length */ /* * Returns 1 if 'facility' is enabled, 0 otherwise. @@ -702,15 +699,6 @@ static int __init efi_reuse_config(u64 tables, int nr_tables) return ret; } -static void get_nr_runtime_map(void) -{ - if (!efi_setup) - return; - - nr_efi_runtime_map = (efi_data_len - sizeof(struct efi_setup_data)) / - sizeof(efi_memory_desc_t); -} - void __init efi_init(void) { efi_char16_t *c16; @@ -718,7 +706,6 @@ void __init efi_init(void) int i = 0; void *tmp; - get_nr_runtime_map(); #ifdef CONFIG_X86_32 if (boot_params.efi_info.efi_systab_hi || boot_params.efi_info.efi_memmap_hi) { @@ -933,10 +920,7 @@ static int __init save_runtime_map(void) count++; } - efi_runtime_map = q; - nr_efi_runtime_map = count; - efi_runtime_map_setup(efi_runtime_map, nr_efi_runtime_map, - boot_params.efi_info.efi_memdesc_size); + efi_runtime_map_setup(q, count, memmap.desc_size); return 0; out: diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 324b65103851..6284f158a47d 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -232,5 +232,4 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, void __init parse_efi_setup(u64 phys_addr, u32 data_len) { efi_setup = phys_addr + sizeof(struct setup_data); - efi_data_len = data_len - sizeof(struct setup_data); } From 41a34cec2e0df7798ea322ed1480bc3d3facdc8e Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 3 Jan 2014 11:54:31 +0800 Subject: [PATCH 23/24] x86: ksysfs.c build fix kbuild test robot report below error for randconfig: arch/x86/kernel/ksysfs.c: In function 'get_setup_data_paddr': arch/x86/kernel/ksysfs.c:81:3: error: implicit declaration of function 'ioremap_cache' [-Werror=implicit-function-declaration] arch/x86/kernel/ksysfs.c:86:3: error: implicit declaration of function 'iounmap' [-Werror=implicit-function-declaration] Fix it by including in ksysfs.c Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/kernel/ksysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index eb53d153f307..c2bedaea11f7 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -17,6 +17,7 @@ #include #include +#include #include static ssize_t version_show(struct kobject *kobj, From 5c12af0c41e3417e1939095325920463b5f8e726 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 3 Jan 2014 11:56:49 +0800 Subject: [PATCH 24/24] x86/efi: parse_efi_setup() build fix In case without CONFIG_EFI, there will be below build error: arch/x86/built-in.o: In function `setup_arch': (.init.text+0x9dc): undefined reference to `parse_efi_setup' Thus fix it by adding blank inline function in asm/efi.h Also remove an unused declaration for variable efi_data_len. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4d1ba80b6ff1..3b978c472d08 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -142,8 +142,6 @@ struct efi_setup_data { }; extern u64 efi_setup; -extern u32 efi_data_len; -extern void parse_efi_setup(u64 phys_addr, u32 data_len); #ifdef CONFIG_EFI @@ -153,7 +151,7 @@ static inline bool efi_is_native(void) } extern struct console early_efi_console; - +extern void parse_efi_setup(u64 phys_addr, u32 data_len); #else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. @@ -165,6 +163,7 @@ extern struct console early_efi_console; #define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) #define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) #define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) +static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */