linux/include/trace/events/huge_memory.h
Vernon Yang 0562041977 mm: khugepaged: skip lazy-free folios
For example, create three task: hot1 -> cold -> hot2.  After all three
task are created, each allocate memory 128MB.  the hot1/hot2 task
continuously access 128 MB memory, while the cold task only accesses its
memory briefly and then call madvise(MADV_FREE).  However, khugepaged
still prioritizes scanning the cold task and only scans the hot2 task
after completing the scan of the cold task.

All folios in VM_DROPPABLE are lazyfree, Collapsing maintains that
property, so we can just collapse and memory pressure in the future will
free it up.  In contrast, collapsing in !VM_DROPPABLE does not maintain
that property, the collapsed folio will not be lazyfree and memory
pressure in the future will not be able to free it up.

So if the user has explicitly informed us via MADV_FREE that this memory
will be freed, and this vma does not have VM_DROPPABLE flags, it is
appropriate for khugepaged to skip it only, thereby avoiding unnecessary
scan and collapse operations to reducing CPU wastage.

Here are the performance test results:
(Throughput bigger is better, other smaller is better)

Testing on x86_64 machine:

| task hot2           | without patch | with patch    |  delta  |
|---------------------|---------------|---------------|---------|
| total accesses time |  3.14 sec     |  2.93 sec     | -6.69%  |
| cycles per access   |  4.96         |  2.21         | -55.44% |
| Throughput          |  104.38 M/sec |  111.89 M/sec | +7.19%  |
| dTLB-load-misses    |  284814532    |  69597236     | -75.56% |

Testing on qemu-system-x86_64 -enable-kvm:

| task hot2           | without patch | with patch    |  delta  |
|---------------------|---------------|---------------|---------|
| total accesses time |  3.35 sec     |  2.96 sec     | -11.64% |
| cycles per access   |  7.29         |  2.07         | -71.60% |
| Throughput          |  97.67 M/sec  |  110.77 M/sec | +13.41% |
| dTLB-load-misses    |  241600871    |  3216108      | -98.67% |

[vernon2gm@gmail.com: add comment about VM_DROPPABLE in code, make it clearer]
  Link: https://lkml.kernel.org/r/i4uowkt4h2ev47obm5h2vtd4zbk6fyw5g364up7kkjn2vmcikq@auepvqethj5r
Link: https://lkml.kernel.org/r/20260221093918.1456187-5-vernon2gm@gmail.com
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
Acked-by: David Hildenbrand (arm) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2026-04-05 13:53:03 -07:00

268 lines
6.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM huge_memory
#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
#define __HUGE_MEMORY_H
#include <linux/tracepoint.h>
#define SCAN_STATUS \
EM( SCAN_FAIL, "failed") \
EM( SCAN_SUCCEED, "succeeded") \
EM( SCAN_NO_PTE_TABLE, "no_pte_table") \
EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \
EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \
EM( SCAN_PAGE_NULL, "page_null") \
EM( SCAN_SCAN_ABORT, "scan_aborted") \
EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
EM( SCAN_PAGE_LRU, "page_not_in_lru") \
EM( SCAN_PAGE_LOCK, "page_locked") \
EM( SCAN_PAGE_ANON, "page_not_anon") \
EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
EM( SCAN_PAGE_COMPOUND, "page_compound") \
EM( SCAN_ANY_PROCESS, "no_process_for_page") \
EM( SCAN_VMA_NULL, "vma_null") \
EM( SCAN_VMA_CHECK, "vma_check_failed") \
EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
EM( SCAN_TRUNCATED, "truncated") \
EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
EM( SCAN_STORE_FAILED, "store_failed") \
EM( SCAN_COPY_MC, "copy_poisoned_page") \
EM( SCAN_PAGE_FILLED, "page_filled") \
EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
#undef EM
#undef EMe
#define EM(a, b) TRACE_DEFINE_ENUM(a);
#define EMe(a, b) TRACE_DEFINE_ENUM(a);
SCAN_STATUS
#undef EM
#undef EMe
#define EM(a, b) {a, b},
#define EMe(a, b) {a, b}
TRACE_EVENT(mm_khugepaged_scan_pmd,
TP_PROTO(struct mm_struct *mm, struct folio *folio,
int referenced, int none_or_zero, int status, int unmapped),
TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned long, pfn)
__field(int, referenced)
__field(int, none_or_zero)
__field(int, status)
__field(int, unmapped)
),
TP_fast_assign(
__entry->mm = mm;
__entry->pfn = folio ? folio_pfn(folio) : -1;
__entry->referenced = referenced;
__entry->none_or_zero = none_or_zero;
__entry->status = status;
__entry->unmapped = unmapped;
),
TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
__entry->mm,
__entry->pfn,
__entry->referenced,
__entry->none_or_zero,
__print_symbolic(__entry->status, SCAN_STATUS),
__entry->unmapped)
);
TRACE_EVENT(mm_collapse_huge_page,
TP_PROTO(struct mm_struct *mm, int isolated, int status),
TP_ARGS(mm, isolated, status),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, isolated)
__field(int, status)
),
TP_fast_assign(
__entry->mm = mm;
__entry->isolated = isolated;
__entry->status = status;
),
TP_printk("mm=%p, isolated=%d, status=%s",
__entry->mm,
__entry->isolated,
__print_symbolic(__entry->status, SCAN_STATUS))
);
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
int referenced, int status),
TP_ARGS(folio, none_or_zero, referenced, status),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
__field(int, status)
),
TP_fast_assign(
__entry->pfn = folio ? folio_pfn(folio) : -1;
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
__entry->status = status;
),
TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
__print_symbolic(__entry->status, SCAN_STATUS))
);
TRACE_EVENT(mm_collapse_huge_page_swapin,
TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
TP_ARGS(mm, swapped_in, referenced, ret),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, swapped_in)
__field(int, referenced)
__field(int, ret)
),
TP_fast_assign(
__entry->mm = mm;
__entry->swapped_in = swapped_in;
__entry->referenced = referenced;
__entry->ret = ret;
),
TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
__entry->mm,
__entry->swapped_in,
__entry->referenced,
__entry->ret)
);
TRACE_EVENT(mm_khugepaged_scan_file,
TP_PROTO(struct mm_struct *mm, struct folio *folio, struct file *file,
int present, int swap, int result),
TP_ARGS(mm, folio, file, present, swap, result),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned long, pfn)
__string(filename, file->f_path.dentry->d_iname)
__field(int, present)
__field(int, swap)
__field(int, result)
),
TP_fast_assign(
__entry->mm = mm;
__entry->pfn = folio ? folio_pfn(folio) : -1;
__assign_str(filename);
__entry->present = present;
__entry->swap = swap;
__entry->result = result;
),
TP_printk("mm=%p, scan_pfn=0x%lx, filename=%s, present=%d, swap=%d, result=%s",
__entry->mm,
__entry->pfn,
__get_str(filename),
__entry->present,
__entry->swap,
__print_symbolic(__entry->result, SCAN_STATUS))
);
TRACE_EVENT(mm_khugepaged_collapse_file,
TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
unsigned long addr, bool is_shmem, struct file *file,
int nr, int result),
TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned long, hpfn)
__field(pgoff_t, index)
__field(unsigned long, addr)
__field(bool, is_shmem)
__string(filename, file->f_path.dentry->d_iname)
__field(int, nr)
__field(int, result)
),
TP_fast_assign(
__entry->mm = mm;
__entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
__entry->index = index;
__entry->addr = addr;
__entry->is_shmem = is_shmem;
__assign_str(filename);
__entry->nr = nr;
__entry->result = result;
),
TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
__entry->mm,
__entry->hpfn,
__entry->index,
__entry->addr,
__entry->is_shmem,
__get_str(filename),
__entry->nr,
__print_symbolic(__entry->result, SCAN_STATUS))
);
TRACE_EVENT(mm_khugepaged_scan,
TP_PROTO(struct mm_struct *mm, unsigned int progress,
bool full_scan_finished),
TP_ARGS(mm, progress, full_scan_finished),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned int, progress)
__field(bool, full_scan_finished)
),
TP_fast_assign(
__entry->mm = mm;
__entry->progress = progress;
__entry->full_scan_finished = full_scan_finished;
),
TP_printk("mm=%p, progress=%u, full_scan_finished=%d",
__entry->mm,
__entry->progress,
__entry->full_scan_finished)
);
#endif /* __HUGE_MEMORY_H */
#include <trace/define_trace.h>