From 117cda9a7847286484d2353af64728a9875effd4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:06 -0500 Subject: [PATCH 01/13] arm64: kexec: make dtb_mem always enabled Currently, dtb_mem is enabled only when CONFIG_KEXEC_FILE is enabled. This adds ugly ifdefs to c files. Always enabled dtb_mem, when it is not used, it is NULL. Change the dtb_mem to phys_addr_t, as it is a physical address. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-2-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kexec.h | 4 ++-- arch/arm64/kernel/machine_kexec.c | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index d24b527e8c00..61530ec3a9b1 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -90,18 +90,18 @@ static inline void crash_prepare_suspend(void) {} static inline void crash_post_resume(void) {} #endif -#ifdef CONFIG_KEXEC_FILE #define ARCH_HAS_KIMAGE_ARCH struct kimage_arch { void *dtb; - unsigned long dtb_mem; + phys_addr_t dtb_mem; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_mem; unsigned long elf_headers_sz; }; +#ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_image_ops; struct kimage; diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a0b144cfaea7..8096a6aa1d49 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -204,11 +204,7 @@ void machine_kexec(struct kimage *kimage) * In kexec_file case, the kernel starts directly without purgatory. */ cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, -#ifdef CONFIG_KEXEC_FILE - kimage->arch.dtb_mem); -#else - 0); -#endif + kimage->arch.dtb_mem); BUG(); /* Should never get here. */ } From 41f67d40a31d6b007d372461f171875fd940f17d Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:07 -0500 Subject: [PATCH 02/13] arm64: hibernate: variable pudp is used instead of pd4dp There should be p4dp used when p4d page is allocated. This is not a functional issue, but for the logical correctness this should be fixed. Fixes: e9f6376858b9 ("arm64: add support for folded p4d page tables") Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-3-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 9c9f47e9f7f4..0a54d81c90f9 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -190,10 +190,10 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgdp = pgd_offset_pgd(trans_pgd, dst_addr); if (pgd_none(READ_ONCE(*pgdp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); - if (!pudp) + p4dp = (void *)get_safe_page(GFP_ATOMIC); + if (!pgdp) return -ENOMEM; - pgd_populate(&init_mm, pgdp, pudp); + pgd_populate(&init_mm, pgdp, p4dp); } p4dp = p4d_offset(pgdp, dst_addr); From 072e3d96a79a5876691c6532f91f930157144a2a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:08 -0500 Subject: [PATCH 03/13] arm64: hibernate: move page handling function to new trans_pgd.c Now, that we abstracted the required functions move them to a new home. Later, we will generalize these function in order to be useful outside of hibernation. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-4-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 + arch/arm64/include/asm/trans_pgd.h | 21 +++ arch/arm64/kernel/hibernate.c | 228 +------------------------- arch/arm64/mm/Makefile | 1 + arch/arm64/mm/trans_pgd.c | 250 +++++++++++++++++++++++++++++ 5 files changed, 277 insertions(+), 227 deletions(-) create mode 100644 arch/arm64/include/asm/trans_pgd.h create mode 100644 arch/arm64/mm/trans_pgd.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f39568b28ec1..fc0ed9d6e011 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1132,6 +1132,10 @@ config CRASH_DUMP For more details see Documentation/admin-guide/kdump/kdump.rst +config TRANS_TABLE + def_bool y + depends on HIBERNATION + config XEN_DOM0 def_bool y depends on XEN diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h new file mode 100644 index 000000000000..23153c13d1ce --- /dev/null +++ b/arch/arm64/include/asm/trans_pgd.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2020, Microsoft Corporation. + * Pavel Tatashin + */ + +#ifndef _ASM_TRANS_TABLE_H +#define _ASM_TRANS_TABLE_H + +#include +#include +#include + +int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, + unsigned long end); + +int trans_pgd_map_page(pgd_t *trans_pgd, void *page, unsigned long dst_addr, + pgprot_t pgprot); + +#endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 0a54d81c90f9..4a38662f0d90 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -16,7 +16,6 @@ #define pr_fmt(x) "hibernate: " x #include #include -#include #include #include #include @@ -31,13 +30,12 @@ #include #include #include -#include -#include #include #include #include #include #include +#include #include /* @@ -178,54 +176,6 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); -static int trans_pgd_map_page(pgd_t *trans_pgd, void *page, - unsigned long dst_addr, - pgprot_t pgprot) -{ - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - pgdp = pgd_offset_pgd(trans_pgd, dst_addr); - if (pgd_none(READ_ONCE(*pgdp))) { - p4dp = (void *)get_safe_page(GFP_ATOMIC); - if (!pgdp) - return -ENOMEM; - pgd_populate(&init_mm, pgdp, p4dp); - } - - p4dp = p4d_offset(pgdp, dst_addr); - if (p4d_none(READ_ONCE(*p4dp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); - if (!pudp) - return -ENOMEM; - p4d_populate(&init_mm, p4dp, pudp); - } - - pudp = pud_offset(p4dp, dst_addr); - if (pud_none(READ_ONCE(*pudp))) { - pmdp = (void *)get_safe_page(GFP_ATOMIC); - if (!pmdp) - return -ENOMEM; - pud_populate(&init_mm, pudp, pmdp); - } - - pmdp = pmd_offset(pudp, dst_addr); - if (pmd_none(READ_ONCE(*pmdp))) { - ptep = (void *)get_safe_page(GFP_ATOMIC); - if (!ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - - ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); - - return 0; -} - /* * Copies length bytes, starting at src_start into an new page, * perform cache maintenance, then maps it at the specified address low @@ -462,182 +412,6 @@ int swsusp_arch_suspend(void) return ret; } -static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) -{ - pte_t pte = READ_ONCE(*src_ptep); - - if (pte_valid(pte)) { - /* - * Resume will overwrite areas that may be marked - * read only (code, rodata). Clear the RDONLY bit from - * the temporary mappings we use during restore. - */ - set_pte(dst_ptep, pte_mkwrite(pte)); - } else if (debug_pagealloc_enabled() && !pte_none(pte)) { - /* - * debug_pagealloc will removed the PTE_VALID bit if - * the page isn't in use by the resume kernel. It may have - * been in use by the original kernel, in which case we need - * to put it back in our copy to do the restore. - * - * Before marking this entry valid, check the pfn should - * be mapped. - */ - BUG_ON(!pfn_valid(pte_pfn(pte))); - - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); - } -} - -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) -{ - pte_t *src_ptep; - pte_t *dst_ptep; - unsigned long addr = start; - - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_ptep) - return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); - dst_ptep = pte_offset_kernel(dst_pmdp, start); - - src_ptep = pte_offset_kernel(src_pmdp, start); - do { - _copy_pte(dst_ptep, src_ptep, addr); - } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); - - return 0; -} - -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) -{ - pmd_t *src_pmdp; - pmd_t *dst_pmdp; - unsigned long next; - unsigned long addr = start; - - if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmdp) - return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); - } - dst_pmdp = pmd_offset(dst_pudp, start); - - src_pmdp = pmd_offset(src_pudp, start); - do { - pmd_t pmd = READ_ONCE(*src_pmdp); - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - continue; - if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) - return -ENOMEM; - } else { - set_pmd(dst_pmdp, - __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); - } - } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); - - return 0; -} - -static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, - unsigned long end) -{ - pud_t *dst_pudp; - pud_t *src_pudp; - unsigned long next; - unsigned long addr = start; - - if (p4d_none(READ_ONCE(*dst_p4dp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pudp) - return -ENOMEM; - p4d_populate(&init_mm, dst_p4dp, dst_pudp); - } - dst_pudp = pud_offset(dst_p4dp, start); - - src_pudp = pud_offset(src_p4dp, start); - do { - pud_t pud = READ_ONCE(*src_pudp); - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - continue; - if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) - return -ENOMEM; - } else { - set_pud(dst_pudp, - __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); - } - } while (dst_pudp++, src_pudp++, addr = next, addr != end); - - return 0; -} - -static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, - unsigned long end) -{ - p4d_t *dst_p4dp; - p4d_t *src_p4dp; - unsigned long next; - unsigned long addr = start; - - dst_p4dp = p4d_offset(dst_pgdp, start); - src_p4dp = p4d_offset(src_pgdp, start); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(READ_ONCE(*src_p4dp))) - continue; - if (copy_pud(dst_p4dp, src_p4dp, addr, next)) - return -ENOMEM; - } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); - - return 0; -} - -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) -{ - unsigned long next; - unsigned long addr = start; - pgd_t *src_pgdp = pgd_offset_k(start); - - dst_pgdp = pgd_offset_pgd(dst_pgdp, start); - do { - next = pgd_addr_end(addr, end); - if (pgd_none(READ_ONCE(*src_pgdp))) - continue; - if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) - return -ENOMEM; - } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); - - return 0; -} - -static int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end) -{ - int rc; - pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); - - if (!trans_pgd) { - pr_err("Failed to allocate memory for temporary page tables.\n"); - return -ENOMEM; - } - - rc = copy_page_tables(trans_pgd, start, end); - if (!rc) - *dst_pgdp = trans_pgd; - - return rc; -} - /* * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit(). * diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3de3b6..77222d92667a 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -6,6 +6,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o +obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c new file mode 100644 index 000000000000..e048d1f5c912 --- /dev/null +++ b/arch/arm64/mm/trans_pgd.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Transitional page tables for kexec and hibernate + * + * This file derived from: arch/arm64/kernel/hibernate.c + * + * Copyright (c) 2020, Microsoft Corporation. + * Pavel Tatashin + * + */ + +/* + * Transitional tables are used during system transferring from one world to + * another: such as during hibernate restore, and kexec reboots. During these + * phases one cannot rely on page table not being overwritten. This is because + * hibernate and kexec can overwrite the current page tables during transition. + */ + +#include +#include +#include +#include +#include +#include +#include + +static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) +{ + pte_t pte = READ_ONCE(*src_ptep); + + if (pte_valid(pte)) { + /* + * Resume will overwrite areas that may be marked + * read only (code, rodata). Clear the RDONLY bit from + * the temporary mappings we use during restore. + */ + set_pte(dst_ptep, pte_mkwrite(pte)); + } else if (debug_pagealloc_enabled() && !pte_none(pte)) { + /* + * debug_pagealloc will removed the PTE_VALID bit if + * the page isn't in use by the resume kernel. It may have + * been in use by the original kernel, in which case we need + * to put it back in our copy to do the restore. + * + * Before marking this entry valid, check the pfn should + * be mapped. + */ + BUG_ON(!pfn_valid(pte_pfn(pte))); + + set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); + } +} + +static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, + unsigned long end) +{ + pte_t *src_ptep; + pte_t *dst_ptep; + unsigned long addr = start; + + dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_ptep) + return -ENOMEM; + pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); + dst_ptep = pte_offset_kernel(dst_pmdp, start); + + src_ptep = pte_offset_kernel(src_pmdp, start); + do { + _copy_pte(dst_ptep, src_ptep, addr); + } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); + + return 0; +} + +static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, + unsigned long end) +{ + pmd_t *src_pmdp; + pmd_t *dst_pmdp; + unsigned long next; + unsigned long addr = start; + + if (pud_none(READ_ONCE(*dst_pudp))) { + dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmdp) + return -ENOMEM; + pud_populate(&init_mm, dst_pudp, dst_pmdp); + } + dst_pmdp = pmd_offset(dst_pudp, start); + + src_pmdp = pmd_offset(src_pudp, start); + do { + pmd_t pmd = READ_ONCE(*src_pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + continue; + if (pmd_table(pmd)) { + if (copy_pte(dst_pmdp, src_pmdp, addr, next)) + return -ENOMEM; + } else { + set_pmd(dst_pmdp, + __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); + } + } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); + + return 0; +} + +static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, + unsigned long end) +{ + pud_t *dst_pudp; + pud_t *src_pudp; + unsigned long next; + unsigned long addr = start; + + if (p4d_none(READ_ONCE(*dst_p4dp))) { + dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pudp) + return -ENOMEM; + p4d_populate(&init_mm, dst_p4dp, dst_pudp); + } + dst_pudp = pud_offset(dst_p4dp, start); + + src_pudp = pud_offset(src_p4dp, start); + do { + pud_t pud = READ_ONCE(*src_pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + continue; + if (pud_table(pud)) { + if (copy_pmd(dst_pudp, src_pudp, addr, next)) + return -ENOMEM; + } else { + set_pud(dst_pudp, + __pud(pud_val(pud) & ~PUD_SECT_RDONLY)); + } + } while (dst_pudp++, src_pudp++, addr = next, addr != end); + + return 0; +} + +static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, + unsigned long end) +{ + p4d_t *dst_p4dp; + p4d_t *src_p4dp; + unsigned long next; + unsigned long addr = start; + + dst_p4dp = p4d_offset(dst_pgdp, start); + src_p4dp = p4d_offset(src_pgdp, start); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(READ_ONCE(*src_p4dp))) + continue; + if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + return -ENOMEM; + } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); + + return 0; +} + +static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, + unsigned long end) +{ + unsigned long next; + unsigned long addr = start; + pgd_t *src_pgdp = pgd_offset_k(start); + + dst_pgdp = pgd_offset_pgd(dst_pgdp, start); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(READ_ONCE(*src_pgdp))) + continue; + if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) + return -ENOMEM; + } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); + + return 0; +} + +int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, + unsigned long end) +{ + int rc; + pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + + if (!trans_pgd) { + pr_err("Failed to allocate memory for temporary page tables.\n"); + return -ENOMEM; + } + + rc = copy_page_tables(trans_pgd, start, end); + if (!rc) + *dst_pgdp = trans_pgd; + + return rc; +} + +int trans_pgd_map_page(pgd_t *trans_pgd, void *page, + unsigned long dst_addr, + pgprot_t pgprot) +{ + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + pgdp = pgd_offset_pgd(trans_pgd, dst_addr); + if (pgd_none(READ_ONCE(*pgdp))) { + p4dp = (void *)get_safe_page(GFP_ATOMIC); + if (!pgdp) + return -ENOMEM; + pgd_populate(&init_mm, pgdp, p4dp); + } + + p4dp = p4d_offset(pgdp, dst_addr); + if (p4d_none(READ_ONCE(*p4dp))) { + pudp = (void *)get_safe_page(GFP_ATOMIC); + if (!pudp) + return -ENOMEM; + p4d_populate(&init_mm, p4dp, pudp); + } + + pudp = pud_offset(p4dp, dst_addr); + if (pud_none(READ_ONCE(*pudp))) { + pmdp = (void *)get_safe_page(GFP_ATOMIC); + if (!pmdp) + return -ENOMEM; + pud_populate(&init_mm, pudp, pmdp); + } + + pmdp = pmd_offset(pudp, dst_addr); + if (pmd_none(READ_ONCE(*pmdp))) { + ptep = (void *)get_safe_page(GFP_ATOMIC); + if (!ptep) + return -ENOMEM; + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + + ptep = pte_offset_kernel(pmdp, dst_addr); + set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); + + return 0; +} From 50f53fb721817a6efa541cca24f1b7caa84801c1 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:09 -0500 Subject: [PATCH 04/13] arm64: trans_pgd: make trans_pgd_map_page generic kexec is going to use a different allocator, so make trans_pgd_map_page to accept allocator as an argument, and also kexec is going to use a different map protection, so also pass it via argument. Signed-off-by: Pavel Tatashin Reviewed-by: Matthias Brugger Link: https://lore.kernel.org/r/20210125191923.1060122-5-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 19 +++++++++++++++++-- arch/arm64/kernel/hibernate.c | 12 +++++++++++- arch/arm64/mm/trans_pgd.c | 30 ++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index 23153c13d1ce..b46409b25234 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -12,10 +12,25 @@ #include #include +/* + * trans_alloc_page + * - Allocator that should return exactly one zeroed page, if this + * allocator fails, trans_pgd_create_copy() and trans_pgd_map_page() + * return -ENOMEM error. + * + * trans_alloc_arg + * - Passed to trans_alloc_page as an argument + */ + +struct trans_pgd_info { + void * (*trans_alloc_page)(void *arg); + void *trans_alloc_arg; +}; + int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, unsigned long end); -int trans_pgd_map_page(pgd_t *trans_pgd, void *page, unsigned long dst_addr, - pgprot_t pgprot); +int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, + void *page, unsigned long dst_addr, pgprot_t pgprot); #endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 4a38662f0d90..c173f280bfea 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -176,6 +176,11 @@ int arch_hibernation_header_restore(void *addr) } EXPORT_SYMBOL(arch_hibernation_header_restore); +static void *hibernate_page_alloc(void *arg) +{ + return (void *)get_safe_page((gfp_t)(unsigned long)arg); +} + /* * Copies length bytes, starting at src_start into an new page, * perform cache maintenance, then maps it at the specified address low @@ -192,6 +197,11 @@ static int create_safe_exec_page(void *src_start, size_t length, unsigned long dst_addr, phys_addr_t *phys_dst_addr) { + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; + void *page = (void *)get_safe_page(GFP_ATOMIC); pgd_t *trans_pgd; int rc; @@ -206,7 +216,7 @@ static int create_safe_exec_page(void *src_start, size_t length, if (!trans_pgd) return -ENOMEM; - rc = trans_pgd_map_page(trans_pgd, page, dst_addr, + rc = trans_pgd_map_page(&trans_info, trans_pgd, page, dst_addr, PAGE_KERNEL_EXEC); if (rc) return rc; diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index e048d1f5c912..f28eceba2242 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -25,6 +25,11 @@ #include #include +static void *trans_alloc(struct trans_pgd_info *info) +{ + return info->trans_alloc_page(info->trans_alloc_arg); +} + static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { pte_t pte = READ_ONCE(*src_ptep); @@ -201,9 +206,18 @@ int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, return rc; } -int trans_pgd_map_page(pgd_t *trans_pgd, void *page, - unsigned long dst_addr, - pgprot_t pgprot) +/* + * Add map entry to trans_pgd for a base-size page at PTE level. + * info: contains allocator and its argument + * trans_pgd: page table in which new map is added. + * page: page to be mapped. + * dst_addr: new VA address for the page + * pgprot: protection for the page. + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, + void *page, unsigned long dst_addr, pgprot_t pgprot) { pgd_t *pgdp; p4d_t *p4dp; @@ -213,7 +227,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pgdp = pgd_offset_pgd(trans_pgd, dst_addr); if (pgd_none(READ_ONCE(*pgdp))) { - p4dp = (void *)get_safe_page(GFP_ATOMIC); + p4dp = trans_alloc(info); if (!pgdp) return -ENOMEM; pgd_populate(&init_mm, pgdp, p4dp); @@ -221,7 +235,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, p4dp = p4d_offset(pgdp, dst_addr); if (p4d_none(READ_ONCE(*p4dp))) { - pudp = (void *)get_safe_page(GFP_ATOMIC); + pudp = trans_alloc(info); if (!pudp) return -ENOMEM; p4d_populate(&init_mm, p4dp, pudp); @@ -229,7 +243,7 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pudp = pud_offset(p4dp, dst_addr); if (pud_none(READ_ONCE(*pudp))) { - pmdp = (void *)get_safe_page(GFP_ATOMIC); + pmdp = trans_alloc(info); if (!pmdp) return -ENOMEM; pud_populate(&init_mm, pudp, pmdp); @@ -237,14 +251,14 @@ int trans_pgd_map_page(pgd_t *trans_pgd, void *page, pmdp = pmd_offset(pudp, dst_addr); if (pmd_none(READ_ONCE(*pmdp))) { - ptep = (void *)get_safe_page(GFP_ATOMIC); + ptep = trans_alloc(info); if (!ptep) return -ENOMEM; pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, dst_addr); - set_pte(ptep, pfn_pte(virt_to_pfn(page), PAGE_KERNEL_EXEC)); + set_pte(ptep, pfn_pte(virt_to_pfn(page), pgprot)); return 0; } From 89d1410f4af55c621395548355f3520415d2bcff Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:10 -0500 Subject: [PATCH 05/13] arm64: trans_pgd: pass allocator trans_pgd_create_copy Make trans_pgd_create_copy and its subroutines to use allocator that is passed as an argument Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-6-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 4 +-- arch/arm64/kernel/hibernate.c | 7 ++++- arch/arm64/mm/trans_pgd.c | 49 ++++++++++++++++++------------ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index b46409b25234..7fbf6a3ccff7 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -27,8 +27,8 @@ struct trans_pgd_info { void *trans_alloc_arg; }; -int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end); +int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd, + unsigned long start, unsigned long end); int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, void *page, unsigned long dst_addr, pgprot_t pgprot); diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index c173f280bfea..94fc275cdd21 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -437,13 +437,18 @@ int swsusp_arch_resume(void) phys_addr_t phys_hibernate_exit; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); + struct trans_pgd_info trans_info = { + .trans_alloc_page = hibernate_page_alloc, + .trans_alloc_arg = (void *)GFP_ATOMIC, + }; /* * Restoring the memory image will overwrite the ttbr1 page tables. * Create a second copy of just the linear map, and use this when * restoring. */ - rc = trans_pgd_create_copy(&tmp_pg_dir, PAGE_OFFSET, PAGE_END); + rc = trans_pgd_create_copy(&trans_info, &tmp_pg_dir, PAGE_OFFSET, + PAGE_END); if (rc) return rc; diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f28eceba2242..47b6b7029907 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -57,14 +57,14 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) } } -static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, - unsigned long end) +static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, + pmd_t *src_pmdp, unsigned long start, unsigned long end) { pte_t *src_ptep; pte_t *dst_ptep; unsigned long addr = start; - dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + dst_ptep = trans_alloc(info); if (!dst_ptep) return -ENOMEM; pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); @@ -78,8 +78,8 @@ static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, return 0; } -static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, - unsigned long end) +static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, + pud_t *src_pudp, unsigned long start, unsigned long end) { pmd_t *src_pmdp; pmd_t *dst_pmdp; @@ -87,7 +87,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, unsigned long addr = start; if (pud_none(READ_ONCE(*dst_pudp))) { - dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + dst_pmdp = trans_alloc(info); if (!dst_pmdp) return -ENOMEM; pud_populate(&init_mm, dst_pudp, dst_pmdp); @@ -102,7 +102,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, if (pmd_none(pmd)) continue; if (pmd_table(pmd)) { - if (copy_pte(dst_pmdp, src_pmdp, addr, next)) + if (copy_pte(info, dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { set_pmd(dst_pmdp, @@ -113,7 +113,8 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, return 0; } -static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, +static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, + p4d_t *src_p4dp, unsigned long start, unsigned long end) { pud_t *dst_pudp; @@ -122,7 +123,7 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, unsigned long addr = start; if (p4d_none(READ_ONCE(*dst_p4dp))) { - dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + dst_pudp = trans_alloc(info); if (!dst_pudp) return -ENOMEM; p4d_populate(&init_mm, dst_p4dp, dst_pudp); @@ -137,7 +138,7 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, if (pud_none(pud)) continue; if (pud_table(pud)) { - if (copy_pmd(dst_pudp, src_pudp, addr, next)) + if (copy_pmd(info, dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { set_pud(dst_pudp, @@ -148,7 +149,8 @@ static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start, return 0; } -static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, +static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, + pgd_t *src_pgdp, unsigned long start, unsigned long end) { p4d_t *dst_p4dp; @@ -162,15 +164,15 @@ static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, next = p4d_addr_end(addr, end); if (p4d_none(READ_ONCE(*src_p4dp))) continue; - if (copy_pud(dst_p4dp, src_p4dp, addr, next)) + if (copy_pud(info, dst_p4dp, src_p4dp, addr, next)) return -ENOMEM; } while (dst_p4dp++, src_p4dp++, addr = next, addr != end); return 0; } -static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, - unsigned long end) +static int copy_page_tables(struct trans_pgd_info *info, pgd_t *dst_pgdp, + unsigned long start, unsigned long end) { unsigned long next; unsigned long addr = start; @@ -181,25 +183,34 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, next = pgd_addr_end(addr, end); if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_p4d(dst_pgdp, src_pgdp, addr, next)) + if (copy_p4d(info, dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); return 0; } -int trans_pgd_create_copy(pgd_t **dst_pgdp, unsigned long start, - unsigned long end) +/* + * Create trans_pgd and copy linear map. + * info: contains allocator and its argument + * dst_pgdp: new page table that is created, and to which map is copied. + * start: Start of the interval (inclusive). + * end: End of the interval (exclusive). + * + * Returns 0 on success, and -ENOMEM on failure. + */ +int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **dst_pgdp, + unsigned long start, unsigned long end) { int rc; - pgd_t *trans_pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + pgd_t *trans_pgd = trans_alloc(info); if (!trans_pgd) { pr_err("Failed to allocate memory for temporary page tables.\n"); return -ENOMEM; } - rc = copy_page_tables(trans_pgd, start, end); + rc = copy_page_tables(info, trans_pgd, start, end); if (!rc) *dst_pgdp = trans_pgd; From 5de59884ac0ec56007e4aa8226ee259aa669be80 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:11 -0500 Subject: [PATCH 06/13] arm64: trans_pgd: pass NULL instead of init_mm to *_populate functions trans_pgd_* should be independent from mm context because the tables that are created by this code are used when there are no mm context around, as it is between kernels. Simply replace mm_init's with NULL. Signed-off-by: Pavel Tatashin Acked-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-7-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/mm/trans_pgd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 47b6b7029907..ded8e2ba0308 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -67,7 +67,7 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp, dst_ptep = trans_alloc(info); if (!dst_ptep) return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); + pmd_populate_kernel(NULL, dst_pmdp, dst_ptep); dst_ptep = pte_offset_kernel(dst_pmdp, start); src_ptep = pte_offset_kernel(src_pmdp, start); @@ -90,7 +90,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp, dst_pmdp = trans_alloc(info); if (!dst_pmdp) return -ENOMEM; - pud_populate(&init_mm, dst_pudp, dst_pmdp); + pud_populate(NULL, dst_pudp, dst_pmdp); } dst_pmdp = pmd_offset(dst_pudp, start); @@ -126,7 +126,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp, dst_pudp = trans_alloc(info); if (!dst_pudp) return -ENOMEM; - p4d_populate(&init_mm, dst_p4dp, dst_pudp); + p4d_populate(NULL, dst_p4dp, dst_pudp); } dst_pudp = pud_offset(dst_p4dp, start); @@ -241,7 +241,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, p4dp = trans_alloc(info); if (!pgdp) return -ENOMEM; - pgd_populate(&init_mm, pgdp, p4dp); + pgd_populate(NULL, pgdp, p4dp); } p4dp = p4d_offset(pgdp, dst_addr); @@ -249,7 +249,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, pudp = trans_alloc(info); if (!pudp) return -ENOMEM; - p4d_populate(&init_mm, p4dp, pudp); + p4d_populate(NULL, p4dp, pudp); } pudp = pud_offset(p4dp, dst_addr); @@ -257,7 +257,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, pmdp = trans_alloc(info); if (!pmdp) return -ENOMEM; - pud_populate(&init_mm, pudp, pmdp); + pud_populate(NULL, pudp, pmdp); } pmdp = pmd_offset(pudp, dst_addr); @@ -265,7 +265,7 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, ptep = trans_alloc(info); if (!ptep) return -ENOMEM; - pmd_populate_kernel(&init_mm, pmdp, ptep); + pmd_populate_kernel(NULL, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, dst_addr); From 1401bef703a48cf79c674215f702a1435362beae Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 25 Jan 2021 14:19:12 -0500 Subject: [PATCH 07/13] arm64: mm: Always update TCR_EL1 from __cpu_set_tcr_t0sz() Because only the idmap sets a non-standard T0SZ, __cpu_set_tcr_t0sz() can check for platforms that need to do this using __cpu_uses_extended_idmap() before doing its work. The idmap is only built with enough levels, (and T0SZ bits) to map its single page. To allow hibernate, and then kexec to idmap their single page copy routines, __cpu_set_tcr_t0sz() needs to consider additional users, who may need a different number of levels/T0SZ-bits to the idmap. (i.e. VA_BITS may be enough for the idmap, but not hibernate/kexec) Always read TCR_EL1, and check whether any work needs doing for this request. __cpu_uses_extended_idmap() remains as it is used by KVM, whose idmap is also part of the kernel image. This mostly affects the cpuidle path, where we now get an extra system register read . CC: Lorenzo Pieralisi CC: Sudeep Holla Signed-off-by: James Morse Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-8-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu_context.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 0b3079fd28eb..70ce8c1d2b07 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -81,16 +81,15 @@ static inline bool __cpu_uses_extended_idmap_level(void) } /* - * Set TCR.T0SZ to its default value (based on VA_BITS) + * Ensure TCR.T0SZ is set to the provided value. */ static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { - unsigned long tcr; + unsigned long tcr = read_sysreg(tcr_el1); - if (!__cpu_uses_extended_idmap()) + if ((tcr & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET == t0sz) return; - tcr = read_sysreg(tcr_el1); tcr &= ~TCR_T0SZ_MASK; tcr |= t0sz << TCR_T0SZ_OFFSET; write_sysreg(tcr, tcr_el1); From 7018d467ff2d6a6d3c820a7ad4e897fe2430b040 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 25 Jan 2021 14:19:13 -0500 Subject: [PATCH 08/13] arm64: trans_pgd: hibernate: idmap the single page that holds the copy page routines To resume from hibernate, the contents of memory are restored from the swap image. This may overwrite any page, including the running kernel and its page tables. Hibernate copies the code it uses to do the restore into a single page that it knows won't be overwritten, and maps it with page tables built from pages that won't be overwritten. Today the address it uses for this mapping is arbitrary, but to allow kexec to reuse this code, it needs to be idmapped. To idmap the page we must avoid the kernel helpers that have VA_BITS baked in. Convert create_single_mapping() to take a single PA, and idmap it. The page tables are built in the reverse order to normal using pfn_pte() to stir in any bits between 52:48. T0SZ is always increased to cover 48bits, or 52 if the copy code has bits 52:48 in its PA. Signed-off-by: James Morse [Adopted the original patch from James to trans_pgd interface, so it can be commonly used by both Kexec and Hibernate. Some minor clean-ups.] Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/linux-arm-kernel/20200115143322.214247-4-james.morse@arm.com/ Link: https://lore.kernel.org/r/20210125191923.1060122-9-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/trans_pgd.h | 3 ++ arch/arm64/kernel/hibernate.c | 32 +++++++------------ arch/arm64/mm/trans_pgd.c | 49 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/trans_pgd.h b/arch/arm64/include/asm/trans_pgd.h index 7fbf6a3ccff7..5d08e5adf3d5 100644 --- a/arch/arm64/include/asm/trans_pgd.h +++ b/arch/arm64/include/asm/trans_pgd.h @@ -33,4 +33,7 @@ int trans_pgd_create_copy(struct trans_pgd_info *info, pgd_t **trans_pgd, int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, void *page, unsigned long dst_addr, pgprot_t pgprot); +int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, + unsigned long *t0sz, void *page); + #endif /* _ASM_TRANS_TABLE_H */ diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 94fc275cdd21..9df32ba0d574 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -194,7 +194,6 @@ static void *hibernate_page_alloc(void *arg) * page system. */ static int create_safe_exec_page(void *src_start, size_t length, - unsigned long dst_addr, phys_addr_t *phys_dst_addr) { struct trans_pgd_info trans_info = { @@ -203,7 +202,8 @@ static int create_safe_exec_page(void *src_start, size_t length, }; void *page = (void *)get_safe_page(GFP_ATOMIC); - pgd_t *trans_pgd; + phys_addr_t trans_ttbr0; + unsigned long t0sz; int rc; if (!page) @@ -211,13 +211,7 @@ static int create_safe_exec_page(void *src_start, size_t length, memcpy(page, src_start, length); __flush_icache_range((unsigned long)page, (unsigned long)page + length); - - trans_pgd = (void *)get_safe_page(GFP_ATOMIC); - if (!trans_pgd) - return -ENOMEM; - - rc = trans_pgd_map_page(&trans_info, trans_pgd, page, dst_addr, - PAGE_KERNEL_EXEC); + rc = trans_pgd_idmap_page(&trans_info, &trans_ttbr0, &t0sz, page); if (rc) return rc; @@ -230,12 +224,15 @@ static int create_safe_exec_page(void *src_start, size_t length, * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI * runtime services), while for a userspace-driven test_resume cycle it * points to userspace page tables (and we must point it at a zero page - * ourselves). Elsewhere we only (un)install the idmap with preemption - * disabled, so T0SZ should be as required regardless. + * ourselves). + * + * We change T0SZ as part of installing the idmap. This is undone by + * cpu_uninstall_idmap() in __cpu_suspend_exit(). */ cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - write_sysreg(phys_to_ttbr(virt_to_phys(trans_pgd)), ttbr0_el1); + __cpu_set_tcr_t0sz(t0sz); + write_sysreg(trans_ttbr0, ttbr0_el1); isb(); *phys_dst_addr = virt_to_phys(page); @@ -434,7 +431,6 @@ int swsusp_arch_resume(void) void *zero_page; size_t exit_size; pgd_t *tmp_pg_dir; - phys_addr_t phys_hibernate_exit; void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *, void *, phys_addr_t, phys_addr_t); struct trans_pgd_info trans_info = { @@ -462,19 +458,13 @@ int swsusp_arch_resume(void) return -ENOMEM; } - /* - * Locate the exit code in the bottom-but-one page, so that *NULL - * still has disastrous affects. - */ - hibernate_exit = (void *)PAGE_SIZE; exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start; /* * Copy swsusp_arch_suspend_exit() to a safe page. This will generate * a new set of ttbr0 page tables and load them. */ rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size, - (unsigned long)hibernate_exit, - &phys_hibernate_exit); + (phys_addr_t *)&hibernate_exit); if (rc) { pr_err("Failed to create safe executable page for hibernate_exit code.\n"); return rc; @@ -493,7 +483,7 @@ int swsusp_arch_resume(void) * We can skip this step if we booted at EL1, or are running with VHE. */ if (el2_reset_needed()) { - phys_addr_t el2_vectors = phys_hibernate_exit; /* base */ + phys_addr_t el2_vectors = (phys_addr_t)hibernate_exit; el2_vectors += hibernate_el2_vectors - __hibernate_exit_text_start; /* offset */ diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index ded8e2ba0308..527f0a39c3da 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -273,3 +273,52 @@ int trans_pgd_map_page(struct trans_pgd_info *info, pgd_t *trans_pgd, return 0; } + +/* + * The page we want to idmap may be outside the range covered by VA_BITS that + * can be built using the kernel's p?d_populate() helpers. As a one off, for a + * single page, we build these page tables bottom up and just assume that will + * need the maximum T0SZ. + * + * Returns 0 on success, and -ENOMEM on failure. + * On success trans_ttbr0 contains page table with idmapped page, t0sz is set to + * maximum T0SZ for this page. + */ +int trans_pgd_idmap_page(struct trans_pgd_info *info, phys_addr_t *trans_ttbr0, + unsigned long *t0sz, void *page) +{ + phys_addr_t dst_addr = virt_to_phys(page); + unsigned long pfn = __phys_to_pfn(dst_addr); + int max_msb = (dst_addr & GENMASK(52, 48)) ? 51 : 47; + int bits_mapped = PAGE_SHIFT - 4; + unsigned long level_mask, prev_level_entry, *levels[4]; + int this_level, index, level_lsb, level_msb; + + dst_addr &= PAGE_MASK; + prev_level_entry = pte_val(pfn_pte(pfn, PAGE_KERNEL_EXEC)); + + for (this_level = 3; this_level >= 0; this_level--) { + levels[this_level] = trans_alloc(info); + if (!levels[this_level]) + return -ENOMEM; + + level_lsb = ARM64_HW_PGTABLE_LEVEL_SHIFT(this_level); + level_msb = min(level_lsb + bits_mapped, max_msb); + level_mask = GENMASK_ULL(level_msb, level_lsb); + + index = (dst_addr & level_mask) >> level_lsb; + *(levels[this_level] + index) = prev_level_entry; + + pfn = virt_to_pfn(levels[this_level]); + prev_level_entry = pte_val(pfn_pte(pfn, + __pgprot(PMD_TYPE_TABLE))); + + if (level_msb == max_msb) + break; + } + + *trans_ttbr0 = phys_to_ttbr(__pfn_to_phys(pfn)); + *t0sz = TCR_T0SZ(max_msb + 1); + + return 0; +} From 4c3c31230c912d8f2e49c775555aadf79a43d418 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:14 -0500 Subject: [PATCH 09/13] arm64: kexec: move relocation function setup Currently, kernel relocation function is configured in machine_kexec() at the time of kexec reboot by using control_code_page. This operation, however, is more logical to be done during kexec_load, and thus remove from reboot time. Move, setup of this function to newly added machine_kexec_post_load(). Because once MMU is enabled, kexec control page will contain more than relocation kernel, but also vector table, add pointer to the actual function within this page arch.kern_reloc. Currently, it equals to the beginning of page, we will add offsets later, when vector table is added. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-10-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kexec.h | 1 + arch/arm64/kernel/machine_kexec.c | 46 +++++++++++++------------------ 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 61530ec3a9b1..9befcd87e9a8 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -95,6 +95,7 @@ static inline void crash_post_resume(void) {} struct kimage_arch { void *dtb; phys_addr_t dtb_mem; + phys_addr_t kern_reloc; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_mem; diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 8096a6aa1d49..a8aaa6562429 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -42,6 +42,7 @@ static void _kexec_image_info(const char *func, int line, pr_debug(" start: %lx\n", kimage->start); pr_debug(" head: %lx\n", kimage->head); pr_debug(" nr_segments: %lu\n", kimage->nr_segments); + pr_debug(" kern_reloc: %pa\n", &kimage->arch.kern_reloc); for (i = 0; i < kimage->nr_segments; i++) { pr_debug(" segment[%lu]: %016lx - %016lx, 0x%lx bytes, %lu pages\n", @@ -58,6 +59,22 @@ void machine_kexec_cleanup(struct kimage *kimage) /* Empty routine needed to avoid build errors. */ } +int machine_kexec_post_load(struct kimage *kimage) +{ + void *reloc_code = page_to_virt(kimage->control_code_page); + + memcpy(reloc_code, arm64_relocate_new_kernel, + arm64_relocate_new_kernel_size); + kimage->arch.kern_reloc = __pa(reloc_code); + + /* Flush the reloc_code in preparation for its execution. */ + __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); + flush_icache_range((uintptr_t)reloc_code, (uintptr_t)reloc_code + + arm64_relocate_new_kernel_size); + + return 0; +} + /** * machine_kexec_prepare - Prepare for a kexec reboot. * @@ -143,8 +160,6 @@ static void kexec_segment_flush(const struct kimage *kimage) */ void machine_kexec(struct kimage *kimage) { - phys_addr_t reboot_code_buffer_phys; - void *reboot_code_buffer; bool in_kexec_crash = (kimage == kexec_crash_image); bool stuck_cpus = cpus_are_stuck_in_kernel(); @@ -155,31 +170,8 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); - reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); - kexec_image_info(kimage); - /* - * Copy arm64_relocate_new_kernel to the reboot_code_buffer for use - * after the kernel is shut down. - */ - memcpy(reboot_code_buffer, arm64_relocate_new_kernel, - arm64_relocate_new_kernel_size); - - /* Flush the reboot_code_buffer in preparation for its execution. */ - __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); - - /* - * Although we've killed off the secondary CPUs, we don't update - * the online mask if we're handling a crash kernel and consequently - * need to avoid flush_icache_range(), which will attempt to IPI - * the offline CPUs. Therefore, we must use the __* variant here. - */ - __flush_icache_range((uintptr_t)reboot_code_buffer, - (uintptr_t)reboot_code_buffer + - arm64_relocate_new_kernel_size); - /* Flush the kimage list and its buffers. */ kexec_list_flush(kimage); @@ -193,7 +185,7 @@ void machine_kexec(struct kimage *kimage) /* * cpu_soft_restart will shutdown the MMU, disable data caches, then - * transfer control to the reboot_code_buffer which contains a copy of + * transfer control to the kern_reloc which contains a copy of * the arm64_relocate_new_kernel routine. arm64_relocate_new_kernel * uses physical addressing to relocate the new image to its final * position and transfers control to the image entry point when the @@ -203,7 +195,7 @@ void machine_kexec(struct kimage *kimage) * userspace (kexec-tools). * In kexec_file case, the kernel starts directly without purgatory. */ - cpu_soft_restart(reboot_code_buffer_phys, kimage->head, kimage->start, + cpu_soft_restart(kimage->arch.kern_reloc, kimage->head, kimage->start, kimage->arch.dtb_mem); BUG(); /* Should never get here. */ From 77a43be1164852be4b761e4acaf651f986c4e8d7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:15 -0500 Subject: [PATCH 10/13] arm64: kexec: call kexec_image_info only once Currently, kexec_image_info() is called during load time, and right before kernel is being kexec'ed. There is no need to do both. So, call it only once when segments are loaded and the physical location of page with copy of arm64_relocate_new_kernel is known. Signed-off-by: Pavel Tatashin Acked-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-11-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/machine_kexec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a8aaa6562429..90a335c74442 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -66,6 +66,7 @@ int machine_kexec_post_load(struct kimage *kimage) memcpy(reloc_code, arm64_relocate_new_kernel, arm64_relocate_new_kernel_size); kimage->arch.kern_reloc = __pa(reloc_code); + kexec_image_info(kimage); /* Flush the reloc_code in preparation for its execution. */ __flush_dcache_area(reloc_code, arm64_relocate_new_kernel_size); @@ -84,8 +85,6 @@ int machine_kexec_post_load(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kexec_image_info(kimage); - if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { pr_err("Can't kexec: CPUs are stuck in the kernel.\n"); return -EBUSY; @@ -170,8 +169,6 @@ void machine_kexec(struct kimage *kimage) WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), "Some CPUs may be stale, kdump will be unreliable.\n"); - kexec_image_info(kimage); - /* Flush the kimage list and its buffers. */ kexec_list_flush(kimage); From dbd82fee0f258739272349bc87f5841fc1fb982a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:16 -0500 Subject: [PATCH 11/13] arm64: kexec: arm64_relocate_new_kernel clean-ups and optimizations In preparation to bigger changes to arm64_relocate_new_kernel that would enable this function to do MMU backed memory copy, do few clean-ups and optimizations. These include: 1. Call raw_dcache_line_size() only when relocation is actually going to happen. i.e. kdump type kexec, does not need it. 2. copy_page(dest, src, tmps...) increments dest and src by PAGE_SIZE, so no need to store dest prior to calling copy_page and increment it after. Also, src is not used after a copy, not need to copy either. 3. For consistency use comment on the same line with instruction when it describes the instruction itself. 4. Some comment corrections Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210125191923.1060122-12-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/relocate_kernel.S | 36 +++++++---------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 84eec95ec06c..462ffbc37071 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -17,28 +17,24 @@ /* * arm64_relocate_new_kernel - Put a 2nd stage image in place and boot it. * - * The memory that the old kernel occupies may be overwritten when coping the + * The memory that the old kernel occupies may be overwritten when copying the * new image to its final location. To assure that the * arm64_relocate_new_kernel routine which does that copy is not overwritten, * all code and data needed by arm64_relocate_new_kernel must be between the * symbols arm64_relocate_new_kernel and arm64_relocate_new_kernel_end. The * machine_kexec() routine will copy arm64_relocate_new_kernel to the kexec - * control_code_page, a special page which has been set up to be preserved - * during the copy operation. + * safe memory that has been set up to be preserved during the copy operation. */ SYM_CODE_START(arm64_relocate_new_kernel) - /* Setup the list loop variables. */ mov x18, x2 /* x18 = dtb address */ mov x17, x1 /* x17 = kimage_start */ mov x16, x0 /* x16 = kimage_head */ - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ mov x14, xzr /* x14 = entry ptr */ mov x13, xzr /* x13 = copy dest */ - /* Check if the new image needs relocation. */ tbnz x16, IND_DONE_BIT, .Ldone - + raw_dcache_line_size x15, x0 /* x15 = dcache line size */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ @@ -57,34 +53,18 @@ SYM_CODE_START(arm64_relocate_new_kernel) b.lo 2b dsb sy - mov x20, x13 - mov x21, x12 - copy_page x20, x21, x0, x1, x2, x3, x4, x5, x6, x7 - - /* dest += PAGE_SIZE */ - add x13, x13, PAGE_SIZE + copy_page x13, x12, x0, x1, x2, x3, x4, x5, x6, x7 b .Lnext - .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination - - /* ptr = addr */ - mov x14, x12 + mov x14, x12 /* ptr = addr */ b .Lnext - .Ltest_destination: tbz x16, IND_DESTINATION_BIT, .Lnext - - /* dest = addr */ - mov x13, x12 - + mov x13, x12 /* dest = addr */ .Lnext: - /* entry = *ptr++ */ - ldr x16, [x14], #8 - - /* while (!(entry & DONE)) */ - tbz x16, IND_DONE_BIT, .Lloop - + ldr x16, [x14], #8 /* entry = *ptr++ */ + tbz x16, IND_DONE_BIT, .Lloop /* while (!(entry & DONE)) */ .Ldone: /* wait for writes from copy_page to finish */ dsb nsh From a360190e8a42d47ea80355f286939ba82b02405a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 25 Jan 2021 14:19:17 -0500 Subject: [PATCH 12/13] arm64: kexec: arm64_relocate_new_kernel don't use x0 as temp x0 will contain the only argument to arm64_relocate_new_kernel; don't use it as a temp. Reassigned registers to free-up x0 so we won't need to copy argument, and can use it at the beginning and at the end of the function. Signed-off-by: Pavel Tatashin Reviewed-by: James Morse Link: https://lore.kernel.org/r/20210125191923.1060122-13-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/relocate_kernel.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 462ffbc37071..b78ea5de97a4 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -34,7 +34,7 @@ SYM_CODE_START(arm64_relocate_new_kernel) mov x13, xzr /* x13 = copy dest */ /* Check if the new image needs relocation. */ tbnz x16, IND_DONE_BIT, .Ldone - raw_dcache_line_size x15, x0 /* x15 = dcache line size */ + raw_dcache_line_size x15, x1 /* x15 = dcache line size */ .Lloop: and x12, x16, PAGE_MASK /* x12 = addr */ @@ -43,17 +43,17 @@ SYM_CODE_START(arm64_relocate_new_kernel) tbz x16, IND_SOURCE_BIT, .Ltest_indirection /* Invalidate dest page to PoC. */ - mov x0, x13 - add x20, x0, #PAGE_SIZE + mov x2, x13 + add x20, x2, #PAGE_SIZE sub x1, x15, #1 - bic x0, x0, x1 -2: dc ivac, x0 - add x0, x0, x15 - cmp x0, x20 + bic x2, x2, x1 +2: dc ivac, x2 + add x2, x2, x15 + cmp x2, x20 b.lo 2b dsb sy - copy_page x13, x12, x0, x1, x2, x3, x4, x5, x6, x7 + copy_page x13, x12, x1, x2, x3, x4, x5, x6, x7, x8 b .Lnext .Ltest_indirection: tbz x16, IND_INDIRECTION_BIT, .Ltest_destination From d1bbc35fcab28668c8992c4d5777234b794d7306 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 1 Feb 2021 10:03:06 -0500 Subject: [PATCH 13/13] arm64: hibernate: add __force attribute to gfp_t casting Two new warnings are reported by sparse: "sparse warnings: (new ones prefixed by >>)" >> arch/arm64/kernel/hibernate.c:181:39: sparse: sparse: cast to restricted gfp_t >> arch/arm64/kernel/hibernate.c:202:44: sparse: sparse: cast from restricted gfp_t gfp_t has __bitwise type attribute and requires __force added to casting in order to avoid these warnings. Fixes: 50f53fb72181 ("arm64: trans_pgd: make trans_pgd_map_page generic") Reported-by: kernel test robot Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/r/20210201150306.54099-2-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 9df32ba0d574..b1cef371df2b 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -178,7 +178,7 @@ EXPORT_SYMBOL(arch_hibernation_header_restore); static void *hibernate_page_alloc(void *arg) { - return (void *)get_safe_page((gfp_t)(unsigned long)arg); + return (void *)get_safe_page((__force gfp_t)(unsigned long)arg); } /* @@ -198,7 +198,7 @@ static int create_safe_exec_page(void *src_start, size_t length, { struct trans_pgd_info trans_info = { .trans_alloc_page = hibernate_page_alloc, - .trans_alloc_arg = (void *)GFP_ATOMIC, + .trans_alloc_arg = (__force void *)GFP_ATOMIC, }; void *page = (void *)get_safe_page(GFP_ATOMIC);