diff --git a/include/linux/mm.h b/include/linux/mm.h index f9d07145d1af..ed171ec53a0a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include #include struct mempolicy; @@ -1279,6 +1280,7 @@ static inline void put_page(struct page *page) */ #define GUP_PIN_COUNTING_BIAS (1U << 10) +void put_user_page(struct page *page); void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index cfce186f0c4e..ce55d8f6bc27 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -19,6 +19,10 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, +#if defined(CONFIG_PAGE_PINNER) + /* page refcount was increased by GUP or follow_page(FOLL_GET) */ + PAGE_EXT_GET, +#endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, diff --git a/include/linux/page_pinner.h b/include/linux/page_pinner.h new file mode 100644 index 000000000000..013e3bcb40e8 --- /dev/null +++ b/include/linux/page_pinner.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PAGE_PINNER_H +#define __LINUX_PAGE_PINNER_H + +#include + +#ifdef CONFIG_PAGE_PINNER +extern struct static_key_false page_pinner_inited; +extern struct page_ext_operations page_pinner_ops; + +extern void __reset_page_pinner(struct page *page, unsigned int order, bool free); +extern void __set_page_pinner(struct page *page, unsigned int order); +extern void __dump_page_pinner(struct page *page); + +static inline void reset_page_pinner(struct page *page, unsigned int order) +{ + if (static_branch_unlikely(&page_pinner_inited)) + __reset_page_pinner(page, order, false); +} + +static inline void free_page_pinner(struct page *page, unsigned int order) +{ + if (static_branch_unlikely(&page_pinner_inited)) + __reset_page_pinner(page, 
order, true); +} + +static inline void set_page_pinner(struct page *page, unsigned int order) +{ + if (static_branch_unlikely(&page_pinner_inited)) + __set_page_pinner(page, order); +} + +static inline void dump_page_pinner(struct page *page) +{ + if (static_branch_unlikely(&page_pinner_inited)) + __dump_page_pinner(page); +} +#else +static inline void reset_page_pinner(struct page *page, unsigned int order) +{ +} +static inline void free_page_pinner(struct page *page, unsigned int order) +{ +} +static inline void set_page_pinner(struct page *page, unsigned int order) +{ +} +static inline void dump_page_pinner(struct page *page) +{ +} +#endif /* CONFIG_PAGE_PINNER */ +#endif /* __LINUX_PAGE_PINNER_H */ diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..0c33018c1609 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,22 @@ config PAGE_OWNER If unsure, say N. +config PAGE_PINNER + bool "Track page pinner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of what call chain is the pinner of a page, may + help to find page migration failures. Even if you include this + feature in your build, it is disabled by default. You should pass + "page_pinner=on" to boot parameter in order to enable it. Eats + a fair amount of memory if enabled. + + If unsure, say N. 
+ config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index 7944ba75399c..8de8651da069 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o +obj-$(CONFIG_PAGE_PINNER) += page_pinner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..aa44dea5276f 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "internal.h" @@ -191,6 +192,7 @@ void dump_page(struct page *page, const char *reason) { __dump_page(page, reason); dump_page_owner(page); + dump_page_pinner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/gup.c b/mm/gup.c index 054ff923d3d9..8cbd1dad595e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -82,9 +84,12 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags) { - if (flags & FOLL_GET) - return try_get_compound_head(page, refs); - else if (flags & FOLL_PIN) { + if (flags & FOLL_GET) { + struct page *head = try_get_compound_head(page, refs); + if (head) + set_page_pinner(head, compound_order(head)); + return head; + } else if (flags & FOLL_PIN) { int orig_refs = refs; /* @@ -142,6 +147,9 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ if (refs > 1) page_ref_sub(page, refs - 1); + + if (flags & FOLL_GET) + reset_page_pinner(page, compound_order(page)); put_page(page); } @@ -170,9 +178,15 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) { WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); - if (flags & FOLL_GET) - return try_get_page(page); - 
else if (flags & FOLL_PIN) { + if (flags & FOLL_GET) { + bool ret = try_get_page(page); + + if (ret) { + page = compound_head(page); + set_page_pinner(page, compound_order(page)); + } + return ret; + } else if (flags & FOLL_PIN) { int refs = 1; page = compound_head(page); @@ -214,6 +228,24 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +/* + * put_user_page() - release a page obtained using get_user_pages() or + * follow_page(FOLL_GET) + * @page: pointer to page to be released + * + * Pages that were obtained via get_user_pages()/follow_page(FOLL_GET) must be + * released via put_user_page. + * note: If it's not a page from GUP or follow_page(FOLL_GET), it's harmless. + */ +void put_user_page(struct page *page) +{ + struct page *head = compound_head(page); + + reset_page_pinner(head, compound_order(head)); + put_page(page); +} +EXPORT_SYMBOL(put_user_page); + /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a276c3a8541d..bbe03f36406c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -1285,6 +1286,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + free_page_pinner(page, order); return false; } @@ -1322,6 +1324,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + free_page_pinner(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), diff --git a/mm/page_ext.c b/mm/page_ext.c index de36d5f6ca71..7e44726b3549 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -65,6 +65,9 @@ static struct page_ext_operations *page_ext_ops[] = { #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_PINNER + &page_pinner_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_pinner.c b/mm/page_pinner.c new file mode 100644 index 000000000000..f48a3d2554ef --- /dev/null +++ b/mm/page_pinner.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define PAGE_PINNER_STACK_DEPTH 16 +#define LONTERM_PIN_BUCKETS 4096 + +struct page_pinner { + depot_stack_handle_t handle; + s64 ts_usec; + atomic_t count; +}; + +struct captured_pinner { + depot_stack_handle_t handle; + s64 ts_usec; + int page_mt; + unsigned long page_flags; + unsigned long pfn; +}; + +struct longterm_pinner { + spinlock_t lock; + unsigned int index; + struct captured_pinner pinner[LONTERM_PIN_BUCKETS]; +}; + +static struct longterm_pinner lt_pinner = { + .lock = 
__SPIN_LOCK_UNLOCKED(lt_pinner.lock), +}; + +static s64 threshold_usec = 300000; + +static bool page_pinner_enabled; +DEFINE_STATIC_KEY_FALSE(page_pinner_inited); + +static depot_stack_handle_t failure_handle; + +static int __init early_page_pinner_param(char *buf) +{ + page_pinner_enabled = true; + return 0; +} +early_param("page_pinner", early_page_pinner_param); + +static bool need_page_pinner(void) +{ + return page_pinner_enabled; +} + +static noinline void register_failure_stack(void) +{ + unsigned long entries[4]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL); +} + +static void init_page_pinner(void) +{ + if (!page_pinner_enabled) + return; + + register_failure_stack(); + static_branch_enable(&page_pinner_inited); +} + +struct page_ext_operations page_pinner_ops = { + .size = sizeof(struct page_pinner), + .need = need_page_pinner, + .init = init_page_pinner, +}; + +static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_pinner_ops.offset; +} + +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_PINNER_STACK_DEPTH]; + depot_stack_handle_t handle; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); + handle = stack_depot_save(entries, nr_entries, flags); + if (!handle) + handle = failure_handle; + + return handle; +} + +static void check_lonterm_pin(struct page_pinner *page_pinner, + struct page *page) +{ + s64 now, delta = 0; + unsigned long flags; + unsigned int idx; + + now = ktime_to_us(ktime_get_boottime()); + + /* get/put_page can be raced. 
Ignore that case */ + if (page_pinner->ts_usec < now) + delta = now - page_pinner->ts_usec; + + if (delta <= threshold_usec) + return; + + spin_lock_irqsave(<_pinner.lock, flags); + idx = lt_pinner.index++; + lt_pinner.index %= LONTERM_PIN_BUCKETS; + + lt_pinner.pinner[idx].handle = page_pinner->handle; + lt_pinner.pinner[idx].ts_usec = delta; + lt_pinner.pinner[idx].page_flags = page->flags; + lt_pinner.pinner[idx].page_mt = get_pageblock_migratetype(page); + lt_pinner.pinner[idx].pfn = page_to_pfn(page); + spin_unlock_irqrestore(<_pinner.lock, flags); + +} + +void __reset_page_pinner(struct page *page, unsigned int order, bool free) +{ + struct page_pinner *page_pinner; + struct page_ext *page_ext; + int i; + + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + + for (i = 0; i < (1 << order); i++) { + if (!test_bit(PAGE_EXT_GET, &page_ext->flags)) + continue; + + page_pinner = get_page_pinner(page_ext); + if (free) { + WARN_ON_ONCE(atomic_read(&page_pinner->count)); + atomic_set(&page_pinner->count, 0); + } else { + WARN_ON_ONCE(atomic_dec_if_positive( + &page_pinner->count) < 0); + check_lonterm_pin(page_pinner, page); + } + clear_bit(PAGE_EXT_GET, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __set_page_pinner_handle(struct page *page, + struct page_ext *page_ext, depot_stack_handle_t handle, + unsigned int order) +{ + struct page_pinner *page_pinner; + int i; + s64 usec = ktime_to_us(ktime_get_boottime()); + + for (i = 0; i < (1 << order); i++) { + page_pinner = get_page_pinner(page_ext); + page_pinner->handle = handle; + page_pinner->ts_usec = usec; + set_bit(PAGE_EXT_GET, &page_ext->flags); + atomic_inc(&page_pinner->count); + page_ext = page_ext_next(page_ext); + } +} + +noinline void __set_page_pinner(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + depot_stack_handle_t handle; + + if (unlikely(!page_ext)) + return; + + handle = 
save_stack(GFP_NOWAIT|__GFP_NOWARN); + __set_page_pinner_handle(page, page_ext, handle, order); +} + +static ssize_t +print_page_pinner(char __user *buf, size_t count, unsigned long pfn, + int pageblock_mt, unsigned long page_flags, s64 ts_usec, + depot_stack_handle_t handle, int shared_count) +{ + int ret; + unsigned long *entries; + unsigned int nr_entries; + char *kbuf; + + count = min_t(size_t, count, PAGE_SIZE); + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = snprintf(kbuf, count, + "Page pinned ts %lld us count %d\n", + ts_usec, shared_count); + + if (ret >= count) + goto err; + + /* Print information relevant to grouping pages by mobility */ + ret += snprintf(kbuf + ret, count - ret, + "PFN %lu Block %lu type %s Flags %#lx(%pGp)\n", + pfn, + pfn >> pageblock_order, + migratetype_names[pageblock_mt], + page_flags, &page_flags); + + if (ret >= count) + goto err; + + nr_entries = stack_depot_fetch(handle, &entries); + ret += stack_trace_snprint(kbuf + ret, count - ret, entries, + nr_entries, 0); + if (ret >= count) + goto err; + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +void __dump_page_pinner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct page_pinner *page_pinner; + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + int pageblock_mt; + unsigned long pfn; + int count; + + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); + return; + } + + page_pinner = get_page_pinner(page_ext); + + count = atomic_read(&page_pinner->count); + if (!count) { + pr_alert("page_pinner info is not present (never set?)\n"); + return; + } + + pfn = page_to_pfn(page); + pr_alert("page last pinned ts %lld count %d\n", + page_pinner->ts_usec, + count); + + pageblock_mt = 
get_pageblock_migratetype(page); + pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n", + pfn, + pfn >> pageblock_order, + migratetype_names[pageblock_mt], + page->flags, &page->flags); + + handle = READ_ONCE(page_pinner->handle); + if (!handle) { + pr_alert("page_pinner allocation stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); + } +} + +static ssize_t +read_longterm_page_pinner(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + loff_t i, idx; + struct captured_pinner record; + unsigned long flags; + + if (!static_branch_unlikely(&page_pinner_inited)) + return -EINVAL; + + if (*ppos >= LONTERM_PIN_BUCKETS) + return 0; + + i = *ppos; + *ppos = i + 1; + + /* + * reading the records in the reverse order with newest one + * being read first followed by older ones + */ + idx = (lt_pinner.index - 1 - i + LONTERM_PIN_BUCKETS) % + LONTERM_PIN_BUCKETS; + spin_lock_irqsave(<_pinner.lock, flags); + record = lt_pinner.pinner[idx]; + spin_unlock_irqrestore(<_pinner.lock, flags); + if (!record.handle) + return 0; + + return print_page_pinner(buf, count, record.pfn, record.page_mt, + record.page_flags, record.ts_usec, + record.handle, 0); +} + +static const struct file_operations proc_longterm_pinner_operations = { + .read = read_longterm_page_pinner, +}; + +static int pp_threshold_set(void *data, unsigned long long val) +{ + unsigned long flags; + + threshold_usec = (s64)val; + + spin_lock_irqsave(<_pinner.lock, flags); + memset(lt_pinner.pinner, 0, + sizeof(struct captured_pinner) * LONTERM_PIN_BUCKETS); + lt_pinner.index = 0; + spin_unlock_irqrestore(<_pinner.lock, flags); + return 0; +} + +static int pp_threshold_get(void *data, unsigned long long *val) +{ + *val = (unsigned long long)threshold_usec; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get, + pp_threshold_set, "%lld\n"); + +static int __init page_pinner_init(void) +{ + struct 
dentry *pp_debugfs_root;
+
+	if (!static_branch_unlikely(&page_pinner_inited))
+		return 0;
+
+	pr_info("page_pinner enabled\n");
+	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
+
+	debugfs_create_file("longterm_pinner", 0400, pp_debugfs_root, NULL,
+			    &proc_longterm_pinner_operations);
+
+	debugfs_create_file("threshold", 0644, pp_debugfs_root, NULL,
+			    &pp_threshold_fops);
+	return 0;
+}
+late_initcall(page_pinner_init);