ANDROID: mm: introduce page_pinner

For CMA allocation, it is critical to migrate a page, but sometimes
migration fails. One of the reasons is that some driver holds an extra
refcount on the page for a long time, so the VM cannot migrate the
page at that moment.

The concern here is that there is no effective way to find out who
holds the refcount on the page. This patch introduces a feature to
track a page's pinner. Every get_page site could pin a page for a long
time, but the cost of tracking all of them would be significant, since
get_page is one of the most frequent kernel operations. Furthermore,
the page could be a kernel page rather than a user page, which is
unrelated to the page migration failure.

Thus, this patch keeps track only of pages that failed migration, to
reduce the runtime cost. Once page migration fails in the CMA
allocation path, those pages are marked as "migration failed", and for
every put_page operation against those pages, the call stack of the
put is recorded in the page_pinner buffer. Later, an admin can see
which pages failed and who released their refcounts after the failure.
It helps to effectively find the long-term refcount holders that
prevented the page migration.

Note: page_pinner doesn't guarantee that attributing/unattributing are
atomic if they happen at the same time. It is best effort only, so
false positives can happen.

Bug: 183414571
Bug: 240196534
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Minchan Kim <minchan@google.com>
Change-Id: I603d0c0122734c377db6b1eb95848a6f734173a0
(cherry picked from commit 898cfbf094a2fc13c67fab5b5d3c916f0139833a)

@@ -701,6 +701,7 @@ CONFIG_UBSAN_LOCAL_BOUNDS=y
# CONFIG_UBSAN_BOOL is not set
# CONFIG_UBSAN_ENUM is not set
CONFIG_PAGE_OWNER=y
CONFIG_PAGE_PINNER=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_KASAN=y

@@ -19,6 +19,10 @@ struct page_ext_operations {
enum page_ext_flags {
	PAGE_EXT_OWNER,
	PAGE_EXT_OWNER_ALLOCATED,
#if defined(CONFIG_PAGE_PINNER)
	/* page migration failed */
	PAGE_EXT_PINNER_MIGRATION_FAILED,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	PAGE_EXT_YOUNG,
	PAGE_EXT_IDLE,

@@ -0,0 +1,48 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PAGE_PINNER_H
#define __LINUX_PAGE_PINNER_H

#include <linux/jump_label.h>

#ifdef CONFIG_PAGE_PINNER
extern struct static_key_false page_pinner_inited;
extern struct static_key_true failure_tracking;
extern struct page_ext_operations page_pinner_ops;

extern void __free_page_pinner(struct page *page, unsigned int order);
void __page_pinner_failure_detect(struct page *page);
void __page_pinner_put_page(struct page *page);

static inline void free_page_pinner(struct page *page, unsigned int order)
{
	if (static_branch_unlikely(&page_pinner_inited))
		__free_page_pinner(page, order);
}

static inline void page_pinner_put_page(struct page *page)
{
	if (!static_branch_unlikely(&failure_tracking))
		return;

	__page_pinner_put_page(page);
}

static inline void page_pinner_failure_detect(struct page *page)
{
	if (!static_branch_unlikely(&failure_tracking))
		return;

	__page_pinner_failure_detect(page);
}
#else
static inline void free_page_pinner(struct page *page, unsigned int order)
{
}
static inline void page_pinner_put_page(struct page *page)
{
}
static inline void page_pinner_failure_detect(struct page *page)
{
}
#endif /* CONFIG_PAGE_PINNER */
#endif /* __LINUX_PAGE_PINNER_H */

@@ -119,6 +119,22 @@ config PAGE_TABLE_CHECK_ENFORCED
	  If unsure say "n".

config PAGE_PINNER
	bool "Track page pinner"
	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
	select DEBUG_FS
	select STACKTRACE
	select STACKDEPOT
	select PAGE_EXTENSION
	help
	  This keeps track of which call chain is the pinner of a page; it
	  may help to find page migration failures. Even if you include this
	  feature in your build, it is disabled by default. You should pass
	  "page_pinner=on" as a boot parameter to enable it. It eats a fair
	  amount of memory if enabled.

	  If unsure, say N.

config PAGE_POISONING
	bool "Poison pages after freeing"
	help

@@ -109,6 +109,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_PAGE_PINNER) += page_pinner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o

@@ -1438,6 +1438,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
		if (memcg_kmem_enabled() && PageMemcgKmem(page))
			__memcg_kmem_uncharge_page(page, order);
		reset_page_owner(page, order);
		free_page_pinner(page, order);
		page_table_check_free(page, order);
		return false;
	}
@@ -1478,6 +1479,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
	page_cpupid_reset_last(page);
	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);
	free_page_pinner(page, order);
	page_table_check_free(page, order);

	if (!PageHighMem(page)) {
@@ -9310,8 +9312,17 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
	lru_cache_enable();
	if (ret < 0) {
		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) {
			struct page *page;

			alloc_contig_dump_pages(&cc->migratepages);
			list_for_each_entry(page, &cc->migratepages, lru) {
				/* The page will be freed by putback_movable_pages soon */
				if (page_count(page) == 1)
					continue;
				page_pinner_failure_detect(page);
			}
		}
		putback_movable_pages(&cc->migratepages);
		return ret;
	}

@@ -7,6 +7,7 @@
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
@@ -81,6 +82,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
	&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_PINNER
	&page_pinner_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
	&page_table_check_ops,
#endif

@@ -9,6 +9,7 @@
#include <linux/memory.h>
#include <linux/hugetlb.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/migrate.h>
#include "internal.h"

@@ -666,6 +667,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
out:
	trace_test_pages_isolated(start_pfn, end_pfn, pfn);
	if (pfn < end_pfn)
		page_pinner_failure_detect(pfn_to_page(pfn));

	return ret;
}

mm/page_pinner.c (new file)

@@ -0,0 +1,423 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_pinner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/sched/clock.h>

#include "internal.h"

#define PAGE_PINNER_STACK_DEPTH 16

static unsigned long pp_buf_size = 4096;
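
/* Per-page state, stored in the page_ext area alongside struct page. */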
struct page_pinner {
	depot_stack_handle_t handle;
	u64 ts_usec;
	atomic_t count;
};

enum pp_state {
	PP_PUT,
	PP_FREE,
	PP_FAIL_DETECTED,
};
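
/*
 * Snapshot of a page and the call stack that touched it; one of these
 * records is copied into the global ring buffer for every logged event.
 */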
struct captured_pinner {
	depot_stack_handle_t handle;
	union {
		u64 ts_usec;
		u64 elapsed;
	};

	/* struct page fields */
	unsigned long pfn;
	int count;
	int mapcount;
	struct address_space *mapping;
	unsigned long flags;
	enum pp_state state;
};

struct page_pinner_buffer {
	spinlock_t lock;
	unsigned long index;
	struct captured_pinner *buffer;
};

/* alloc_contig failed pinner */
static struct page_pinner_buffer pp_buffer;

static bool page_pinner_enabled;
DEFINE_STATIC_KEY_FALSE(page_pinner_inited);

DEFINE_STATIC_KEY_TRUE(failure_tracking);
EXPORT_SYMBOL_GPL(failure_tracking);

static depot_stack_handle_t failure_handle;

static int __init early_page_pinner_param(char *buf)
{
	page_pinner_enabled = true;
	return 0;
}
early_param("page_pinner", early_page_pinner_param);

static bool need_page_pinner(void)
{
	return page_pinner_enabled;
}

static noinline void register_failure_stack(void)
{
	unsigned long entries[4];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
}

static void init_page_pinner(void)
{
	if (!page_pinner_enabled)
		return;

	register_failure_stack();
	static_branch_enable(&page_pinner_inited);
}

struct page_ext_operations page_pinner_ops = {
	.size = sizeof(struct page_pinner),
	.need = need_page_pinner,
	.init = init_page_pinner,
};

static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
{
	return (void *)page_ext + page_pinner_ops.offset;
}

static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
	depot_stack_handle_t handle;
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
	handle = stack_depot_save(entries, nr_entries, flags);
	if (!handle)
		handle = failure_handle;

	return handle;
}

static void capture_page_state(struct page *page,
			       struct captured_pinner *record)
{
	record->flags = page->flags;
	record->mapping = page_mapping(page);
	record->pfn = page_to_pfn(page);
	record->count = page_count(page);
	record->mapcount = page_mapcount(page);
}
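
/*
 * Append a record to the global ring buffer. The buffer wraps, so once it
 * is full the oldest entries are overwritten by newer ones.
 */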
static void add_record(struct page_pinner_buffer *pp_buf,
		       struct captured_pinner *record)
{
	unsigned long flags;
	unsigned int idx;

	spin_lock_irqsave(&pp_buf->lock, flags);
	idx = pp_buf->index++;
	pp_buf->index %= pp_buf_size;
	pp_buf->buffer[idx] = *record;
	spin_unlock_irqrestore(&pp_buf->lock, flags);
}
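
/*
 * Called from the page free path: if the page had been flagged as a
 * migration failure, log the freeing call stack and clear the per-page
 * state so a stale record does not outlive the page.
 */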
void __free_page_pinner(struct page *page, unsigned int order)
{
	struct page_pinner *page_pinner;
	struct page_ext *page_ext;
	int i;

	/* free_page could be called before buffer is initialized */
	if (!pp_buffer.buffer)
		return;

	page_ext = lookup_page_ext(page);
	if (unlikely(!page_ext))
		return;

	for (i = 0; i < (1 << order); i++) {
		struct captured_pinner record;

		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
			continue;

		page_pinner = get_page_pinner(page_ext);

		/* record page free call path */
		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
		record.state = PP_FREE;
		capture_page_state(page, &record);
		add_record(&pp_buffer, &record);

		atomic_set(&page_pinner->count, 0);
		page_pinner->ts_usec = 0;
		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
		page_ext = page_ext_next(page_ext);
	}
}
static ssize_t
print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
{
	int ret;
	unsigned long *entries;
	unsigned int nr_entries;
	char *kbuf;

	count = min_t(size_t, count, PAGE_SIZE);
	kbuf = kmalloc(count, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;

	if (record->state == PP_PUT) {
		ret = snprintf(kbuf, count, "At least, pinned for %llu us\n",
			       record->elapsed);
	} else {
		u64 ts_usec = record->ts_usec;
		unsigned long rem_usec = do_div(ts_usec, 1000000);

		ret = snprintf(kbuf, count,
			       "%s [%5lu.%06lu]\n",
			       record->state == PP_FREE ? "Freed at" :
							  "Failure detected at",
			       (unsigned long)ts_usec, rem_usec);
	}

	if (ret >= count)
		goto err;

	/* Print information relevant to grouping pages by mobility */
	ret += snprintf(kbuf + ret, count - ret,
			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
			record->pfn,
			record->pfn >> pageblock_order,
			record->count, record->mapcount,
			record->mapping,
			record->flags, &record->flags);
	if (ret >= count)
		goto err;

	nr_entries = stack_depot_fetch(record->handle, &entries);
	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
				   nr_entries, 0);
	if (ret >= count)
		goto err;

	ret += snprintf(kbuf + ret, count - ret, "\n");
	if (ret >= count)
		goto err;

	if (copy_to_user(buf, kbuf, ret))
		ret = -EFAULT;

	kfree(kbuf);
	return ret;

err:
	kfree(kbuf);
	return -ENOMEM;
}
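
/*
 * Called when migration of @page fails: flag the page so that later
 * put_page() and free events on it are logged, and record the failure
 * itself. The per-page ts_usec keeps the time of the first detected failure.
 */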
void __page_pinner_failure_detect(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now;

	if (unlikely(!page_ext))
		return;

	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
		return;

	now = (u64)ktime_to_us(ktime_get_boottime());
	page_pinner = get_page_pinner(page_ext);
	if (!page_pinner->ts_usec)
		page_pinner->ts_usec = now;
	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	record.ts_usec = now;
	record.state = PP_FAIL_DETECTED;
	capture_page_state(page, &record);
	add_record(&pp_buffer, &record);
}
EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);
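
/*
 * Called when a reference is dropped on a page that previously failed
 * migration: records the dropper's call stack and how long the page has
 * been pinned since the failure was first detected.
 */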
void __page_pinner_put_page(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);
	struct page_pinner *page_pinner;
	struct captured_pinner record;
	u64 now, ts_usec;

	if (unlikely(!page_ext))
		return;

	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
		return;

	page_pinner = get_page_pinner(page_ext);
	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
	now = (u64)ktime_to_us(ktime_get_boottime());
	ts_usec = page_pinner->ts_usec;

	if (now > ts_usec)
		record.elapsed = now - ts_usec;
	else
		record.elapsed = 0;

	record.state = PP_PUT;
	capture_page_state(page, &record);
	add_record(&pp_buffer, &record);
}
EXPORT_SYMBOL_GPL(__page_pinner_put_page);
static ssize_t read_buffer(struct file *file, char __user *buf,
			   size_t count, loff_t *ppos)
{
	u64 tmp;
	loff_t i, idx;
	struct captured_pinner record;
	unsigned long flags;

	if (!static_branch_unlikely(&failure_tracking))
		return -EINVAL;

	if (*ppos >= pp_buf_size)
		return 0;

	i = *ppos;
	*ppos = i + 1;

	/*
	 * reading the records in the reverse order with newest one
	 * being read first followed by older ones
	 */
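	/*
	 * For illustration (values assumed): with pp_buf_size == 4096 and
	 * pp_buffer.index == 10, *ppos == 0 maps to slot 9 (the newest
	 * record), *ppos == 1 to slot 8, and so on.
	 */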
	tmp = pp_buffer.index - 1 - i + pp_buf_size;
	idx = do_div(tmp, pp_buf_size);

	spin_lock_irqsave(&pp_buffer.lock, flags);
	record = pp_buffer.buffer[idx];
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	if (!record.handle)
		return 0;

	return print_page_pinner(buf, count, &record);
}

static const struct file_operations proc_buffer_operations = {
	.read = read_buffer,
};

static int failure_tracking_set(void *data, u64 val)
{
	bool on;

	on = (bool)val;
	if (on)
		static_branch_enable(&failure_tracking);
	else
		static_branch_disable(&failure_tracking);
	return 0;
}

static int failure_tracking_get(void *data, u64 *val)
{
	*val = static_branch_unlikely(&failure_tracking);
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
			 failure_tracking_get,
			 failure_tracking_set, "%llu\n");

static int buffer_size_set(void *data, u64 val)
{
	unsigned long flags;
	struct captured_pinner *new, *old;

	new = kvmalloc_array(val, sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	spin_lock_irqsave(&pp_buffer.lock, flags);
	old = pp_buffer.buffer;
	pp_buffer.buffer = new;
	pp_buffer.index = 0;
	pp_buf_size = val;
	spin_unlock_irqrestore(&pp_buffer.lock, flags);
	kvfree(old);

	return 0;
}

static int buffer_size_get(void *data, u64 *val)
{
	*val = pp_buf_size;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
			 buffer_size_get,
			 buffer_size_set, "%llu\n");
static int __init page_pinner_init(void)
{
	struct dentry *pp_debugfs_root;

	if (!static_branch_unlikely(&page_pinner_inited))
		return 0;

	pp_buffer.buffer = kvmalloc_array(pp_buf_size, sizeof(*pp_buffer.buffer),
					  GFP_KERNEL);
	if (!pp_buffer.buffer) {
		pr_info("page_pinner disabled due to failure of buffer allocation\n");
		return 1;
	}

	spin_lock_init(&pp_buffer.lock);
	pp_buffer.index = 0;

	pr_info("page_pinner enabled\n");

	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);

	debugfs_create_file("buffer", 0444,
			    pp_debugfs_root, NULL,
			    &proc_buffer_operations);

	debugfs_create_file("failure_tracking", 0644,
			    pp_debugfs_root, NULL,
			    &failure_tracking_fops);

	debugfs_create_file("buffer_size", 0644,
			    pp_debugfs_root, NULL,
			    &buffer_size_fops);
	return 0;
}
late_initcall(page_pinner_init)
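
For reference, a minimal usage sketch (not part of the patch): boot with the
"page_pinner=on" kernel parameter, then use the debugfs interface created by
page_pinner_init() above. The /sys/kernel/debug path assumes debugfs is
mounted at its usual location; the file names come straight from the
debugfs_create_file() calls.

    # on the kernel command line
    page_pinner=on

    # after a CMA allocation failure, dump the recorded events (newest first)
    cat /sys/kernel/debug/page_pinner/buffer

    # optionally resize the ring buffer or toggle tracking at runtime
    echo 8192 > /sys/kernel/debug/page_pinner/buffer_size
    echo 0    > /sys/kernel/debug/page_pinner/failure_tracking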