ANDROID: mm: introduce page_pinner

For CMA allocation, it is critical that pages can be migrated, but
sometimes migration fails. One of the common reasons is that a driver
holds a refcount on the page for a long time, so the VM cannot migrate
the page at that time.

The problem is that there is no effective way to find out who holds
the refcount on the page. This patch introduces a feature to keep
track of a page's pinner. Every get_page site could pin a page for a
long time, but the cost of tracking all of them would be significant
since get_page is one of the most frequent kernel operations.
Furthermore, the page could be a kernel page rather than a user page,
which is unrelated to page migration failure. So this patch tracks
only get_user_pages/follow_page with FOLL_GET|FOLL_PIN and friends,
because they are the common APIs for pinning user pages that can cause
migration failure, and they are much less frequent than get_page, so
the runtime cost stays small while still covering many cases
effectively.
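
For illustration only (not part of this patch), a driver-style pinning
path of the kind page_pinner attributes could look like the sketch
below. The helper name is made up; pin_user_pages_fast() and
unpin_user_page() are the standard kernel APIs, and a call chain like
this is what longterm_pinner reports when the pin outlives the
threshold (compare the example output further down):

#include <linux/mm.h>
#include <linux/slab.h>

/* hypothetical driver helper: pins nr_pages user pages starting at uaddr */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages)
{
	struct page **pages;
	int i, pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* takes a pin on each user page backing the buffer */
	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}

	/* while the pages stay pinned here, they cannot be migrated */

	for (i = 0; i < pinned; i++)
		unpin_user_page(pages[i]);
	kfree(pages);
	return 0;
}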

This patch also introduces the put_user_page API. Its purpose is to
mark "the pinner releases the page from now on" at the point where the
page refcount is dropped. Thus, any user of
get_user_pages/follow_page(FOLL_GET) must use put_user_page as the
pairing release for those functions. Otherwise, page_pinner will
report the caller as a long-term pinner, which is a false positive,
but it should not affect stability.
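
As a plain illustration (not part of this patch), the pairing looks
roughly like this. The helper name is hypothetical; only
get_user_pages_fast() and the new put_user_page() are real APIs:

#include <linux/mm.h>

static int example_touch_user_page(unsigned long uaddr)
{
	struct page *page;
	int got;

	/* get_user_pages_fast() takes a FOLL_GET reference on the page */
	got = get_user_pages_fast(uaddr, 1, 0, &page);
	if (got != 1)
		return got < 0 ? got : -EFAULT;

	/* ... short-lived use of the page ... */

	/* release with put_user_page() instead of put_page() */
	put_user_page(page);
	return 0;
}

Using put_page() here would still drop the refcount correctly;
put_user_page() additionally clears the page_pinner attribution so the
caller is not reported as a long-term pinner.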

* $debugfs/page_pinner/threshold

It indicates the threshold (in microseconds) used to flag long-term
pinning. It is configurable (the default is 300000us). Once you write
a new value to the threshold, the old data is cleared.

* $debugfs/page_pinner/longterm_pinner

It shows call sites where the duration of pinning was greater than
the threshold. Internally, it uses a static array of 4096 elements and
overwrites the oldest entries once it overflows, so you could lose
some information.

example)
Page pinned ts 76953865787 us count 1
PFN 9856945 Block 9625 type Movable Flags 0x8000000000080014(uptodate|lru|swapbacked)
 __set_page_pinner+0x34/0xcc
 try_grab_page+0x19c/0x1a0
 follow_page_pte+0x1c0/0x33c
 follow_page_mask+0xc0/0xc8
 __get_user_pages+0x178/0x414
 __gup_longterm_locked+0x80/0x148
 internal_get_user_pages_fast+0x140/0x174
 pin_user_pages_fast+0x24/0x40
 CCC
 BBB
 AAA
 __arm64_sys_ioctl+0x94/0xd0
 el0_svc_common+0xa4/0x180
 do_el0_svc+0x28/0x88
 el0_svc+0x14/0x24

note: page_pinner doesn't guarantee that attributing/unattributing
are atomic when they happen at the same time. It is best effort, so
false positives can happen.

Bug: 183414571
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Minchan Kim <minchan@google.com>
Change-Id: Ife37ec360eef993d390b9c131732218a4dfd2f04
Author: Minchan Kim, 2021-03-18 09:56:10 -07:00 (committed by Minchan Kim)
parent 926cf69af5
commit 6e12c5b7d4
10 changed files with 491 additions and 6 deletions

include/linux/mm.h

@@ -32,6 +32,7 @@
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/page_pinner.h>
#include <linux/android_kabi.h>
struct mempolicy;
@@ -1279,6 +1280,7 @@ static inline void put_page(struct page *page)
*/
#define GUP_PIN_COUNTING_BIAS (1U << 10)
void put_user_page(struct page *page);
void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);

include/linux/page_ext.h

@@ -19,6 +19,10 @@ struct page_ext_operations {
enum page_ext_flags {
PAGE_EXT_OWNER,
PAGE_EXT_OWNER_ALLOCATED,
#if defined(CONFIG_PAGE_PINNER)
/* page refcount was increased by GUP or follow_page(FOLL_GET) */
PAGE_EXT_GET,
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,

include/linux/page_pinner.h (new file)

@@ -0,0 +1,52 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PAGE_PINNER_H
#define __LINUX_PAGE_PINNER_H
#include <linux/jump_label.h>
#ifdef CONFIG_PAGE_PINNER
extern struct static_key_false page_pinner_inited;
extern struct page_ext_operations page_pinner_ops;
extern void __reset_page_pinner(struct page *page, unsigned int order, bool free);
extern void __set_page_pinner(struct page *page, unsigned int order);
extern void __dump_page_pinner(struct page *page);
static inline void reset_page_pinner(struct page *page, unsigned int order)
{
if (static_branch_unlikely(&page_pinner_inited))
__reset_page_pinner(page, order, false);
}
static inline void free_page_pinner(struct page *page, unsigned int order)
{
if (static_branch_unlikely(&page_pinner_inited))
__reset_page_pinner(page, order, true);
}
static inline void set_page_pinner(struct page *page, unsigned int order)
{
if (static_branch_unlikely(&page_pinner_inited))
__set_page_pinner(page, order);
}
static inline void dump_page_pinner(struct page *page)
{
if (static_branch_unlikely(&page_pinner_inited))
__dump_page_pinner(page);
}
#else
static inline void reset_page_pinner(struct page *page, unsigned int order)
{
}
static inline void free_page_pinner(struct page *page, unsigned int order)
{
}
static inline void set_page_pinner(struct page *page, unsigned int order)
{
}
static inline void dump_page_pinner(struct page *page)
{
}
#endif /* CONFIG_PAGE_PINNER */
#endif /* __LINUX_PAGE_PINNER_H */

mm/Kconfig.debug

@@ -62,6 +62,22 @@ config PAGE_OWNER
If unsure, say N.
config PAGE_PINNER
bool "Track page pinner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
select DEBUG_FS
select STACKTRACE
select STACKDEPOT
select PAGE_EXTENSION
help
This keeps track of what call chain is the pinner of a page, may
help to find page migration failures. Even if you include this
feature in your build, it is disabled by default. You should pass
"page_pinner=on" to boot parameter in order to enable it. Eats
a fair amount of memory if enabled.
If unsure, say N.
config PAGE_POISONING
bool "Poison pages after freeing"
help

mm/Makefile

@@ -99,6 +99,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_PAGE_PINNER) += page_pinner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o

mm/debug.c

@@ -13,6 +13,7 @@
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/ctype.h>
#include "internal.h"
@@ -191,6 +192,7 @@ void dump_page(struct page *page, const char *reason)
{
__dump_page(page, reason);
dump_page_owner(page);
dump_page_pinner(page);
}
EXPORT_SYMBOL(dump_page);

mm/gup.c

@@ -18,6 +18,8 @@
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/page_pinner.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -82,9 +84,12 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page,
int refs,
unsigned int flags)
{
-	if (flags & FOLL_GET)
-		return try_get_compound_head(page, refs);
-	else if (flags & FOLL_PIN) {
+	if (flags & FOLL_GET) {
+		struct page *head = try_get_compound_head(page, refs);
+
+		if (head)
+			set_page_pinner(head, compound_order(head));
+		return head;
+	} else if (flags & FOLL_PIN) {
int orig_refs = refs;
/*
@@ -142,6 +147,9 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
*/
if (refs > 1)
page_ref_sub(page, refs - 1);
if (flags & FOLL_GET)
reset_page_pinner(page, compound_order(page));
put_page(page);
}
@@ -170,9 +178,15 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
-	if (flags & FOLL_GET)
-		return try_get_page(page);
-	else if (flags & FOLL_PIN) {
+	if (flags & FOLL_GET) {
+		bool ret = try_get_page(page);
+
+		if (ret) {
+			page = compound_head(page);
+			set_page_pinner(page, compound_order(page));
+		}
+		return ret;
+	} else if (flags & FOLL_PIN) {
int refs = 1;
page = compound_head(page);
@@ -214,6 +228,24 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
/*
* put_user_page() - release a page obtained using get_user_pages() or
* follow_page(FOLL_GET)
* @page: pointer to page to be released
*
* Pages that were obtained via get_user_pages()/follow_page(FOLL_GET) must be
* released via put_user_page.
* note: If it's not a page from GUP or follow_page(FOLL_GET), it's harmless.
*/
void put_user_page(struct page *page)
{
struct page *head = compound_head(page);
reset_page_pinner(head, compound_order(head));
put_page(page);
}
EXPORT_SYMBOL(put_user_page);
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.

mm/page_alloc.c

@@ -62,6 +62,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
@@ -1285,6 +1286,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
free_page_pinner(page, order);
return false;
}
@@ -1322,6 +1324,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
free_page_pinner(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),

mm/page_ext.c

@@ -65,6 +65,9 @@ static struct page_ext_operations *page_ext_ops[] = {
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_PINNER
&page_pinner_ops,
#endif
};
unsigned long page_ext_size = sizeof(struct page_ext);

mm/page_pinner.c (new file, 370 lines)

@@ -0,0 +1,370 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>
#include "internal.h"
#define PAGE_PINNER_STACK_DEPTH 16
#define LONTERM_PIN_BUCKETS 4096
struct page_pinner {
depot_stack_handle_t handle;
s64 ts_usec;
atomic_t count;
};
struct captured_pinner {
depot_stack_handle_t handle;
s64 ts_usec;
int page_mt;
unsigned long page_flags;
unsigned long pfn;
};
struct longterm_pinner {
spinlock_t lock;
unsigned int index;
struct captured_pinner pinner[LONTERM_PIN_BUCKETS];
};
static struct longterm_pinner lt_pinner = {
.lock = __SPIN_LOCK_UNLOCKED(lt_pinner.lock),
};
static s64 threshold_usec = 300000;
static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
static depot_stack_handle_t failure_handle;
static int __init early_page_pinner_param(char *buf)
{
page_pinner_enabled = true;
return 0;
}
early_param("page_pinner", early_page_pinner_param);
static bool need_page_pinner(void)
{
return page_pinner_enabled;
}
static noinline void register_failure_stack(void)
{
unsigned long entries[4];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}
static void init_page_pinner(void)
{
if (!page_pinner_enabled)
return;
register_failure_stack();
static_branch_enable(&page_pinner_inited);
}
struct page_ext_operations page_pinner_ops = {
.size = sizeof(struct page_pinner),
.need = need_page_pinner,
.init = init_page_pinner,
};
static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
return (void *)page_ext + page_pinner_ops.offset;
}
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_PINNER_STACK_DEPTH];
depot_stack_handle_t handle;
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
return handle;
}
static void check_lonterm_pin(struct page_pinner *page_pinner,
struct page *page)
{
s64 now, delta = 0;
unsigned long flags;
unsigned int idx;
now = ktime_to_us(ktime_get_boottime());
/* get/put_page can be raced. Ignore that case */
if (page_pinner->ts_usec < now)
delta = now - page_pinner->ts_usec;
if (delta <= threshold_usec)
return;
spin_lock_irqsave(&lt_pinner.lock, flags);
idx = lt_pinner.index++;
lt_pinner.index %= LONTERM_PIN_BUCKETS;
lt_pinner.pinner[idx].handle = page_pinner->handle;
lt_pinner.pinner[idx].ts_usec = delta;
lt_pinner.pinner[idx].page_flags = page->flags;
lt_pinner.pinner[idx].page_mt = get_pageblock_migratetype(page);
lt_pinner.pinner[idx].pfn = page_to_pfn(page);
spin_unlock_irqrestore(&lt_pinner.lock, flags);
}
void __reset_page_pinner(struct page *page, unsigned int order, bool free)
{
struct page_pinner *page_pinner;
struct page_ext *page_ext;
int i;
page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
for (i = 0; i < (1 << order); i++) {
if (!test_bit(PAGE_EXT_GET, &page_ext->flags))
continue;
page_pinner = get_page_pinner(page_ext);
if (free) {
WARN_ON_ONCE(atomic_read(&page_pinner->count));
atomic_set(&page_pinner->count, 0);
} else {
WARN_ON_ONCE(atomic_dec_if_positive(
&page_pinner->count) < 0);
check_lonterm_pin(page_pinner, page);
}
clear_bit(PAGE_EXT_GET, &page_ext->flags);
page_ext = page_ext_next(page_ext);
}
}
static inline void __set_page_pinner_handle(struct page *page,
struct page_ext *page_ext, depot_stack_handle_t handle,
unsigned int order)
{
struct page_pinner *page_pinner;
int i;
s64 usec = ktime_to_us(ktime_get_boottime());
for (i = 0; i < (1 << order); i++) {
page_pinner = get_page_pinner(page_ext);
page_pinner->handle = handle;
page_pinner->ts_usec = usec;
set_bit(PAGE_EXT_GET, &page_ext->flags);
atomic_inc(&page_pinner->count);
page_ext = page_ext_next(page_ext);
}
}
noinline void __set_page_pinner(struct page *page, unsigned int order)
{
struct page_ext *page_ext = lookup_page_ext(page);
depot_stack_handle_t handle;
if (unlikely(!page_ext))
return;
handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
__set_page_pinner_handle(page, page_ext, handle, order);
}
static ssize_t
print_page_pinner(char __user *buf, size_t count, unsigned long pfn,
int pageblock_mt, unsigned long page_flags, s64 ts_usec,
depot_stack_handle_t handle, int shared_count)
{
int ret;
unsigned long *entries;
unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
ret = snprintf(kbuf, count,
"Page pinned ts %lld us count %d\n",
ts_usec, shared_count);
if (ret >= count)
goto err;
/* Print information relevant to grouping pages by mobility */
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
page_flags, &page_flags);
if (ret >= count)
goto err;
nr_entries = stack_depot_fetch(handle, &entries);
ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
nr_entries, 0);
if (ret >= count)
goto err;
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
if (copy_to_user(buf, kbuf, ret))
ret = -EFAULT;
kfree(kbuf);
return ret;
err:
kfree(kbuf);
return -ENOMEM;
}
void __dump_page_pinner(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_pinner *page_pinner;
depot_stack_handle_t handle;
unsigned long *entries;
unsigned int nr_entries;
int pageblock_mt;
unsigned long pfn;
int count;
if (unlikely(!page_ext)) {
pr_alert("There is not page extension available.\n");
return;
}
page_pinner = get_page_pinner(page_ext);
count = atomic_read(&page_pinner->count);
if (!count) {
pr_alert("page_pinner info is not present (never set?)\n");
return;
}
pfn = page_to_pfn(page);
pr_alert("page last pinned ts %lld count %d\n",
page_pinner->ts_usec,
count);
pageblock_mt = get_pageblock_migratetype(page);
pr_alert("PFN %lu Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
page->flags, &page->flags);
handle = READ_ONCE(page_pinner->handle);
if (!handle) {
pr_alert("page_pinner allocation stack trace missing\n");
} else {
nr_entries = stack_depot_fetch(handle, &entries);
stack_trace_print(entries, nr_entries, 0);
}
}
static ssize_t
read_longterm_page_pinner(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
loff_t i, idx;
struct captured_pinner record;
unsigned long flags;
if (!static_branch_unlikely(&page_pinner_inited))
return -EINVAL;
if (*ppos >= LONTERM_PIN_BUCKETS)
return 0;
i = *ppos;
*ppos = i + 1;
/*
* reading the records in the reverse order with newest one
* being read first followed by older ones
*/
idx = (lt_pinner.index - 1 - i + LONTERM_PIN_BUCKETS) %
LONTERM_PIN_BUCKETS;
spin_lock_irqsave(&lt_pinner.lock, flags);
record = lt_pinner.pinner[idx];
spin_unlock_irqrestore(&lt_pinner.lock, flags);
if (!record.handle)
return 0;
return print_page_pinner(buf, count, record.pfn, record.page_mt,
record.page_flags, record.ts_usec,
record.handle, 0);
}
static const struct file_operations proc_longterm_pinner_operations = {
.read = read_longterm_page_pinner,
};
static int pp_threshold_set(void *data, unsigned long long val)
{
unsigned long flags;
threshold_usec = (s64)val;
spin_lock_irqsave(&lt_pinner.lock, flags);
memset(lt_pinner.pinner, 0,
sizeof(struct captured_pinner) * LONTERM_PIN_BUCKETS);
lt_pinner.index = 0;
spin_unlock_irqrestore(&lt_pinner.lock, flags);
return 0;
}
static int pp_threshold_get(void *data, unsigned long long *val)
{
*val = (unsigned long long)threshold_usec;
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(pp_threshold_fops, pp_threshold_get,
pp_threshold_set, "%lld\n");
static int __init page_pinner_init(void)
{
struct dentry *pp_debugfs_root;
if (!static_branch_unlikely(&page_pinner_inited))
return 0;
pr_info("page_pinner enabled\n");
pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
debugfs_create_file("longterm_pinner", 0400, pp_debugfs_root, NULL,
&proc_longterm_pinner_operations);
debugfs_create_file("threshold", 0444, pp_debugfs_root, NULL,
&pp_threshold_fops);
return 0;
}
late_initcall(page_pinner_init)