linux/arch/x86/include/asm/page_64.h
Ankur Arora cb431accb3 x86/clear_page: introduce clear_pages()
Performance when clearing with string instructions (x86-64-stosq and
similar) can vary significantly based on the chunk-size used.

  $ perf bench mem memset -k 4KB -s 4GB -f x86-64-stosq
  # Running 'mem/memset' benchmark:
  # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S)
  # Copying 4GB bytes ...

      13.748208 GB/sec

  $ perf bench mem memset -k 2MB -s 4GB -f x86-64-stosq
  # Running 'mem/memset' benchmark:
  # function 'x86-64-stosq' (movsq-based memset() in
  # arch/x86/lib/memset_64.S)
  # Copying 4GB bytes ...

      15.067900 GB/sec

  $ perf bench mem memset -k 1GB -s 4GB -f x86-64-stosq
  # Running 'mem/memset' benchmark:
  # function 'x86-64-stosq' (movsq-based memset() in arch/x86/lib/memset_64.S)
  # Copying 4GB bytes ...

      38.104311 GB/sec

(All three runs on AMD Milan.)

With a change in chunk-size from 4KB to 1GB, we see the performance go
from 13.7 GB/sec to 38.1 GB/sec.  For a chunk-size of 2MB the improvement
(to 15.1 GB/sec) isn't quite as drastic, but it is still worth adding a
clear_page() variant that can handle contiguous page-extents.

Link: https://lkml.kernel.org/r/20260107072009.1615991-6-ankur.a.arora@oracle.com
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-01-20 19:24:40 -08:00

158 lines
4.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H
#include <asm/page_64_types.h>
#ifndef __ASSEMBLER__
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>
/* duplicates the declaration in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
extern unsigned long direct_map_physmem_end;
/*
 * Convert a kernel virtual address to a physical address, with no sanity
 * checking of the input.
 *
 * The unsigned subtraction wraps around exactly when @x lies below
 * __START_KERNEL_map, so comparing @x against the difference tells the
 * two cases apart:
 *  - no wrap (x > delta): the difference is rebased by phys_base;
 *  - wrap: adding (__START_KERNEL_map - PAGE_OFFSET) back leaves
 *    x - PAGE_OFFSET.
 */
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
	unsigned long delta = x - __START_KERNEL_map;
	unsigned long rebase;

	/* the carry of the subtraction above decides which branch is taken */
	if (x > delta)
		rebase = phys_base;
	else
		rebase = __START_KERNEL_map - PAGE_OFFSET;

	return delta + rebase;
}
/*
* __phys_addr(): with CONFIG_DEBUG_VIRTUAL set this is an out-of-line
* function that is only declared here (NOTE(review): presumably it adds
* sanity checks on the address - confirm against its definition).
* Otherwise it expands directly to the inline __phys_addr_nodebug().
*/
#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#endif
/*
 * Convert the virtual address @x to a physical address by taking its
 * offset from __START_KERNEL_map and adding phys_base.
 *
 * VIRTUAL_BUG_ON() is applied to offsets that are not below
 * KERNEL_IMAGE_SIZE.
 */
static inline unsigned long __phys_addr_symbol(unsigned long x)
{
	unsigned long image_off = x - __START_KERNEL_map;

	/*
	 * An address below __START_KERNEL_map wraps around to a huge offset,
	 * so the single upper-bound check covers the lower bound as well.
	 */
	VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);

	return phys_base + image_off;
}
/* Expands to its argument unchanged. */
#define __phys_reloc_hide(x) (x)
/*
* Out-of-line fallback targeted by the CALL alternative in clear_pages()
* below. At that call site RDI holds the address, RCX the byte count and
* RAX zero.
* NOTE(review): the C prototype lists only @page - confirm the register
* contract against the assembly implementation.
* NOTE(review): KCFI_REFERENCE() presumably keeps a kCFI type reference for
* a function that is only called from inline asm - confirm.
*/
void __clear_pages_unrolled(void *page);
KCFI_REFERENCE(__clear_pages_unrolled);
/**
* clear_pages() - clear a page range using a kernel virtual address.
* @addr: start address of kernel page range
* @npages: number of pages
*
* Switch between three implementations of page clearing based on CPU
* capabilities:
*
* - __clear_pages_unrolled(): the oldest, slowest and universally
* supported method. Zeroes via 8-byte MOV instructions unrolled 8x
* to write a 64-byte cacheline in each loop iteration.
*
* - "REP; STOSQ": really old CPUs had crummy REP implementations.
* Vendor CPU setup code sets 'REP_GOOD' on CPUs where REP can be
* trusted. The instruction writes 8 bytes per REP iteration but
* CPUs can internally batch these together and do larger writes.
*
* - "REP; STOSB": used on CPUs with "enhanced REP MOVSB/STOSB",
* which enumerate 'ERMS' and provide an implementation which,
* unlike "REP; STOSQ" above, isn't overly picky about alignment.
* The instruction writes 1 byte per REP iteration with CPUs
* internally batching these together into larger writes and is
* generally the fastest of the three.
*
* Note that when running as a guest, features exposed by the CPU
* might be mediated by the hypervisor. So, the STOSQ variant might
* be in active use on some systems even when the hardware enumerates
* ERMS.
*
* The range cleared is @npages * PAGE_SIZE bytes starting at @addr.
*
* Does absolutely no exception handling.
*/
static inline void clear_pages(void *addr, unsigned int npages)
{
/* Total number of bytes to clear; handed to the asm in RCX. */
u64 len = npages * PAGE_SIZE;
/*
* Clean up KMSAN metadata for the pages being cleared. The assembly call
* below clobbers @addr, so perform unpoisoning before it.
*/
kmsan_unpoison_memory(addr, len);
/*
* The inline asm embeds a CALL instruction and usually that is a no-no
* due to the compiler not knowing that and thus being unable to track
* callee-clobbered registers.
*
* In this case that is fine because the registers clobbered by
* __clear_pages_unrolled() are part of the inline asm register
* specification.
*
* Register setup, common to all three alternatives:
* RCX ("+c") = len in bytes, RDI ("+D") = addr, RAX ("a") = 0.
* RCX and RDI are read-write operands, so neither len nor addr may be
* relied upon after the asm. The STOSQ variant first shifts RCX right
* by 3 to turn the byte count into a count of 8-byte stores; the STOSB
* variant uses the byte count as is.
*
* NOTE(review): per ALTERNATIVE_2() the ERMS variant is expected to take
* precedence when both features are set - confirm in asm/alternative.h.
*/
asm volatile(ALTERNATIVE_2("call __clear_pages_unrolled",
"shrq $3, %%rcx; rep stosq", X86_FEATURE_REP_GOOD,
"rep stosb", X86_FEATURE_ERMS)
: "+c" (len), "+D" (addr), ASM_CALL_CONSTRAINT
: "a" (0)
: "cc", "memory");
}
/*
* Self-referential define: makes clear_pages visible to #ifdef/#ifndef.
* NOTE(review): presumably lets generic code detect that this architecture
* supplies its own clear_pages() - confirm against the generic users.
*/
#define clear_pages clear_pages
/*
* clear_page() - clear a single page.
* @addr: address of the page, passed unchanged to clear_pages()
*
* Equivalent to clear_pages(@addr, 1).
*/
static inline void clear_page(void *addr)
{
clear_pages(addr, 1);
}
/*
* copy_page() is only declared here; its definition is not in this header.
* NOTE(review): presumably copies one page from @from to @to - confirm
* against the implementation.
* NOTE(review): KCFI_REFERENCE() presumably keeps a kCFI type reference for
* the function - confirm.
*/
void copy_page(void *to, void *from);
KCFI_REFERENCE(copy_page);
/*
* User space process size. This is the first address outside the user range.
* There are a few constraints that determine this:
*
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
* We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*
* The value is selected by alternative_io() keyed on X86_FEATURE_LA57;
* both candidates are immediates ("i" constraints) of a MOVQ into the
* result register.
*
* Return: (1ul << 47) - PAGE_SIZE, or (1ul << 56) - PAGE_SIZE when the
* X86_FEATURE_LA57 alternative is in effect. Either way the result is one
* page short of the power-of-two boundary.
*/
static __always_inline unsigned long task_size_max(void)
{
unsigned long ret;
/* "small" is the default instruction, "large" its LA57 replacement. */
alternative_io("movq %[small],%0","movq %[large],%0",
X86_FEATURE_LA57,
"=r" (ret),
[small] "i" ((1ul << 47)-PAGE_SIZE),
[large] "i" ((1ul << 56)-PAGE_SIZE));
return ret;
}
#endif /* !__ASSEMBLER__ */
/*
* __HAVE_ARCH_GATE_AREA is defined only when CONFIG_X86_VSYSCALL_EMULATION
* is set. Unlike the declarations above, it is also visible to assembler
* includes because it sits outside the !__ASSEMBLER__ section.
* NOTE(review): presumably consumed by generic code to select arch-specific
* gate-area handling - confirm with the users of this macro.
*/
#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif
#endif /* _ASM_X86_PAGE_64_H */