diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig
index 260befab3d4b..e139d2c9d10f 100644
--- a/arch/arm64/configs/gki_defconfig
+++ b/arch/arm64/configs/gki_defconfig
@@ -701,6 +701,7 @@ CONFIG_UBSAN_LOCAL_BOUNDS=y
 # CONFIG_UBSAN_BOOL is not set
 # CONFIG_UBSAN_ENUM is not set
 CONFIG_PAGE_OWNER=y
+CONFIG_PAGE_PINNER=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_KASAN=y
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 22be4582faae..6136338b2b17 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -19,6 +19,10 @@ struct page_ext_operations {
 enum page_ext_flags {
 	PAGE_EXT_OWNER,
 	PAGE_EXT_OWNER_ALLOCATED,
+#if defined(CONFIG_PAGE_PINNER)
+	/* page migration failed */
+	PAGE_EXT_PINNER_MIGRATION_FAILED,
+#endif
 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	PAGE_EXT_YOUNG,
 	PAGE_EXT_IDLE,
diff --git a/include/linux/page_pinner.h b/include/linux/page_pinner.h
new file mode 100644
index 000000000000..4d574d1ced59
--- /dev/null
+++ b/include/linux/page_pinner.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_PAGE_PINNER_H
+#define __LINUX_PAGE_PINNER_H
+
+#include <linux/jump_label.h>
+
+#ifdef CONFIG_PAGE_PINNER
+extern struct static_key_false page_pinner_inited;
+extern struct static_key_true failure_tracking;
+extern struct page_ext_operations page_pinner_ops;
+
+extern void __free_page_pinner(struct page *page, unsigned int order);
+void __page_pinner_failure_detect(struct page *page);
+void __page_pinner_put_page(struct page *page);
+
+static inline void free_page_pinner(struct page *page, unsigned int order)
+{
+	if (static_branch_unlikely(&page_pinner_inited))
+		__free_page_pinner(page, order);
+}
+
+static inline void page_pinner_put_page(struct page *page)
+{
+	if (!static_branch_unlikely(&failure_tracking))
+		return;
+
+	__page_pinner_put_page(page);
+}
+
+static inline void page_pinner_failure_detect(struct page *page)
+{
+	if (!static_branch_unlikely(&failure_tracking))
+		return;
+
+	__page_pinner_failure_detect(page);
+}
+#else
+static inline void free_page_pinner(struct page *page, unsigned int order)
+{
+}
+static inline void page_pinner_put_page(struct page *page)
+{
+}
+static inline void page_pinner_failure_detect(struct page *page)
+{
+}
+#endif /* CONFIG_PAGE_PINNER */
+#endif /* __LINUX_PAGE_PINNER_H */
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index ce8dded36de9..2eae432e042c 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -119,6 +119,22 @@ config PAGE_TABLE_CHECK_ENFORCED
 
 	  If unsure say "n".
 
+config PAGE_PINNER
+	bool "Track page pinner"
+	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+	select DEBUG_FS
+	select STACKTRACE
+	select STACKDEPOT
+	select PAGE_EXTENSION
+	help
+	  This keeps track of which call chain pinned a page, which may
+	  help find the cause of page migration failures. Even if you
+	  include this feature in your build, it is disabled by default;
+	  pass "page_pinner=on" on the kernel command line to enable it.
+	  It consumes a fair amount of memory if enabled.
+
+	  If unsure, say N.
+
 config PAGE_POISONING
 	bool "Poison pages after freeing"
 	help
diff --git a/mm/Makefile b/mm/Makefile
index 5ef58b2081d4..6878edc7c7d0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -109,6 +109,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
 obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
+obj-$(CONFIG_PAGE_PINNER) += page_pinner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL) += zpool.o
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4afcd84a422c..104003f72893 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1438,6 +1438,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 		if (memcg_kmem_enabled() && PageMemcgKmem(page))
 			__memcg_kmem_uncharge_page(page, order);
 		reset_page_owner(page, order);
+		free_page_pinner(page, order);
 		page_table_check_free(page, order);
 		return false;
 	}
@@ -1478,6 +1479,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	page_cpupid_reset_last(page);
 	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	reset_page_owner(page, order);
+	free_page_pinner(page, order);
 	page_table_check_free(page, order);
 
 	if (!PageHighMem(page)) {
@@ -9310,8 +9312,17 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
 
 	lru_cache_enable();
 	if (ret < 0) {
-		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
+		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) {
+			struct page *page;
+
 			alloc_contig_dump_pages(&cc->migratepages);
+			list_for_each_entry(page, &cc->migratepages, lru) {
+				/* The page will be freed by putback_movable_pages soon */
+				if (page_count(page) == 1)
+					continue;
+				page_pinner_failure_detect(page);
+			}
+		}
 		putback_movable_pages(&cc->migratepages);
 		return ret;
 	}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 1b3a67b9d5e0..6fcae7c60101 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -7,6 +7,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
+#include <linux/page_pinner.h>
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
 #include <linux/rcupdate.h>
@@ -81,6 +82,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	&page_idle_ops,
 #endif
+#ifdef CONFIG_PAGE_PINNER
+	&page_pinner_ops,
+#endif
 #ifdef CONFIG_PAGE_TABLE_CHECK
 	&page_table_check_ops,
 #endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 47fbc1696466..90e6a0090085 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,6 +9,7 @@
 #include <linux/memory.h>
 #include <linux/hugetlb.h>
 #include <linux/page_owner.h>
+#include <linux/page_pinner.h>
 #include <linux/migrate.h>
 #include "internal.h"
 
@@ -666,6 +667,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 
 out:
 	trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+	if (pfn < end_pfn)
+		page_pinner_failure_detect(pfn_to_page(pfn));
 
 	return ret;
 }
diff --git a/mm/page_pinner.c b/mm/page_pinner.c
new file mode 100644
index 000000000000..2a25a3720d11
--- /dev/null
+++ b/mm/page_pinner.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0
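+/*
+ * page_pinner: records the call chains that still hold a reference to
+ * a page when it fails to migrate, to help track down the pinner.
+ *
+ * Usage sketch: boot with "page_pinner=on"; after a migration failure,
+ * read the captured records (newest first) with
+ *	cat /sys/kernel/debug/page_pinner/buffer
+ * Runtime knobs live in the same directory: "failure_tracking"
+ * (default on) and "buffer_size" (number of records).
+ */
+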
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/memblock.h>
+#include <linux/stacktrace.h>
+#include <linux/page_pinner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
+#include <linux/stackdepot.h>
+#include <linux/ktime.h>
+#include <linux/sched/clock.h>
+
+#include "internal.h"
+
+#define PAGE_PINNER_STACK_DEPTH 16
+static unsigned long pp_buf_size = 4096;
+
+struct page_pinner {
+	depot_stack_handle_t handle;
+	u64 ts_usec;
+	atomic_t count;
+};
+
+enum pp_state {
+	PP_PUT,
+	PP_FREE,
+	PP_FAIL_DETECTED,
+};
+
+struct captured_pinner {
+	depot_stack_handle_t handle;
+	union {
+		u64 ts_usec;
+		u64 elapsed;
+	};
+
+	/* struct page fields */
+	unsigned long pfn;
+	int count;
+	int mapcount;
+	struct address_space *mapping;
+	unsigned long flags;
+	enum pp_state state;
+};
+
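+/*
+ * Captured records live in a fixed-size ring buffer. add_record()
+ * overwrites the oldest slot first and read_buffer() walks the slots
+ * newest first; both serialize on @lock.
+ */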
+struct page_pinner_buffer {
+	spinlock_t lock;
+	unsigned long index;
+	struct captured_pinner *buffer;
+};
+
+/* alloc_contig failed pinner */
+static struct page_pinner_buffer pp_buffer;
+
+static bool page_pinner_enabled;
+DEFINE_STATIC_KEY_FALSE(page_pinner_inited);
+
+DEFINE_STATIC_KEY_TRUE(failure_tracking);
+EXPORT_SYMBOL_GPL(failure_tracking);
+
+static depot_stack_handle_t failure_handle;
+
+static int __init early_page_pinner_param(char *buf)
+{
+	page_pinner_enabled = true;
+	return 0;
+}
+early_param("page_pinner", early_page_pinner_param);
+
+static bool need_page_pinner(void)
+{
+	return page_pinner_enabled;
+}
+
+static noinline void register_failure_stack(void)
+{
+	unsigned long entries[4];
+	unsigned int nr_entries;
+
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+	failure_handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
+}
+
+static void init_page_pinner(void)
+{
+	if (!page_pinner_enabled)
+		return;
+
+	register_failure_stack();
+	static_branch_enable(&page_pinner_inited);
+}
+
+struct page_ext_operations page_pinner_ops = {
+	.size = sizeof(struct page_pinner),
+	.need = need_page_pinner,
+	.init = init_page_pinner,
+};
+
+static inline struct page_pinner *get_page_pinner(struct page_ext *page_ext)
+{
+	return (void *)page_ext + page_pinner_ops.offset;
+}
+
+static noinline depot_stack_handle_t save_stack(gfp_t flags)
+{
+	unsigned long entries[PAGE_PINNER_STACK_DEPTH];
+	depot_stack_handle_t handle;
+	unsigned int nr_entries;
+
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
+	handle = stack_depot_save(entries, nr_entries, flags);
+	if (!handle)
+		handle = failure_handle;
+
+	return handle;
+}
+
+static void capture_page_state(struct page *page,
+			       struct captured_pinner *record)
+{
+	record->flags = page->flags;
+	record->mapping = page_mapping(page);
+	record->pfn = page_to_pfn(page);
+	record->count = page_count(page);
+	record->mapcount = page_mapcount(page);
+}
+
+static void add_record(struct page_pinner_buffer *pp_buf,
+		       struct captured_pinner *record)
+{
+	unsigned long flags;
+	unsigned int idx;
+
+	/* the ring buffer is not allocated until late_initcall */
+	if (!pp_buf->buffer)
+		return;
+
+	spin_lock_irqsave(&pp_buf->lock, flags);
+	idx = pp_buf->index++;
+	pp_buf->index %= pp_buf_size;
+	pp_buf->buffer[idx] = *record;
+	spin_unlock_irqrestore(&pp_buf->lock, flags);
+}
+
+void __free_page_pinner(struct page *page, unsigned int order)
+{
+	struct page_pinner *page_pinner;
+	struct page_ext *page_ext;
+	int i;
+
+	/* free_page could be called before buffer is initialized */
+	if (!pp_buffer.buffer)
+		return;
+
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext))
+		return;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct captured_pinner record;
+
+		if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED,
+			      &page_ext->flags))
+			goto next;
+
+		page_pinner = get_page_pinner(page_ext);
+
+		/* record page free call path */
+		record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
+		record.ts_usec = (u64)ktime_to_us(ktime_get_boottime());
+		record.state = PP_FREE;
+		capture_page_state(page, &record);
+		add_record(&pp_buffer, &record);
+
+		atomic_set(&page_pinner->count, 0);
+		page_pinner->ts_usec = 0;
+		clear_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
+next:
+		page_ext = page_ext_next(page_ext);
+	}
+}
+
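+/*
+ * Format one captured record for userspace: a header line (pin
+ * duration for PP_PUT, otherwise the event timestamp), the captured
+ * struct page state and the saved stack trace.
+ */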
+static ssize_t
+print_page_pinner(char __user *buf, size_t count, struct captured_pinner *record)
+{
+	int ret;
+	unsigned long *entries;
+	unsigned int nr_entries;
+	char *kbuf;
+
+	count = min_t(size_t, count, PAGE_SIZE);
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (record->state == PP_PUT) {
+		ret = snprintf(kbuf, count, "At least, pinned for %llu us\n",
+			       record->elapsed);
+	} else {
+		u64 ts_usec = record->ts_usec;
+		unsigned long rem_usec = do_div(ts_usec, 1000000);
+
+		ret = snprintf(kbuf, count,
+			       "%s [%5lu.%06lu]\n",
+			       record->state == PP_FREE ? "Freed at" :
+							  "Failure detected at",
+			       (unsigned long)ts_usec, rem_usec);
+	}
+
+	if (ret >= count)
+		goto err;
+
+	/* Print information relevant to grouping pages by mobility */
+	ret += snprintf(kbuf + ret, count - ret,
+			"PFN 0x%lx Block %lu count %d mapcount %d mapping %pS Flags %#lx(%pGp)\n",
+			record->pfn,
+			record->pfn >> pageblock_order,
+			record->count, record->mapcount,
+			record->mapping,
+			record->flags, &record->flags);
+	if (ret >= count)
+		goto err;
+
+	nr_entries = stack_depot_fetch(record->handle, &entries);
+	ret += stack_trace_snprint(kbuf + ret, count - ret, entries,
+				   nr_entries, 0);
+	if (ret >= count)
+		goto err;
+
+	ret += snprintf(kbuf + ret, count - ret, "\n");
+	if (ret >= count)
+		goto err;
+
+	if (copy_to_user(buf, kbuf, ret))
+		ret = -EFAULT;
+
+	kfree(kbuf);
+	return ret;
+
+err:
+	kfree(kbuf);
+	return -ENOMEM;
+}
+
+void __page_pinner_failure_detect(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_pinner *page_pinner;
+	struct captured_pinner record;
+	u64 now;
+
+	if (unlikely(!page_ext))
+		return;
+
+	if (test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
+		return;
+
+	now = (u64)ktime_to_us(ktime_get_boottime());
+	page_pinner = get_page_pinner(page_ext);
+	if (!page_pinner->ts_usec)
+		page_pinner->ts_usec = now;
+	set_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags);
+	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
+	record.ts_usec = now;
+	record.state = PP_FAIL_DETECTED;
+	capture_page_state(page, &record);
+	add_record(&pp_buffer, &record);
+}
+EXPORT_SYMBOL_GPL(__page_pinner_failure_detect);
+
+void __page_pinner_put_page(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_pinner *page_pinner;
+	struct captured_pinner record;
+	u64 now, ts_usec;
+
+	if (unlikely(!page_ext))
+		return;
+
+	if (!test_bit(PAGE_EXT_PINNER_MIGRATION_FAILED, &page_ext->flags))
+		return;
+
+	page_pinner = get_page_pinner(page_ext);
+	record.handle = save_stack(GFP_NOWAIT|__GFP_NOWARN);
+	now = (u64)ktime_to_us(ktime_get_boottime());
+	ts_usec = page_pinner->ts_usec;
+
+	if (now > ts_usec)
+		record.elapsed = now - ts_usec;
+	else
+		record.elapsed = 0;
+	record.state = PP_PUT;
+	capture_page_state(page, &record);
+	add_record(&pp_buffer, &record);
+}
+EXPORT_SYMBOL_GPL(__page_pinner_put_page);
+
+static ssize_t read_buffer(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	u64 tmp;
+	loff_t i, idx;
+	struct captured_pinner record;
+	unsigned long flags;
+
+	if (!static_branch_unlikely(&failure_tracking))
+		return -EINVAL;
+
+	if (*ppos >= pp_buf_size)
+		return 0;
+
+	i = *ppos;
+	*ppos = i + 1;
+
+	/*
+	 * reading the records in the reverse order with newest one
+	 * being read first followed by older ones
+	 */
+	tmp = pp_buffer.index - 1 - i + pp_buf_size;
+	idx = do_div(tmp, pp_buf_size);
+
+	spin_lock_irqsave(&pp_buffer.lock, flags);
+	record = pp_buffer.buffer[idx];
+	spin_unlock_irqrestore(&pp_buffer.lock, flags);
+	if (!record.handle)
+		return 0;
+
+	return print_page_pinner(buf, count, &record);
+}
+
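+/*
+ * debugfs read() interface: each read() call returns one formatted
+ * record, newest first. *ppos counts records, not bytes; a return of
+ * 0 means all valid records have been read.
+ */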
+static const struct file_operations proc_buffer_operations = {
+	.read = read_buffer,
+};
+
+static int failure_tracking_set(void *data, u64 val)
+{
+	bool on;
+
+	on = (bool)val;
+	if (on)
+		static_branch_enable(&failure_tracking);
+	else
+		static_branch_disable(&failure_tracking);
+	return 0;
+}
+
+static int failure_tracking_get(void *data, u64 *val)
+{
+	*val = static_branch_unlikely(&failure_tracking);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(failure_tracking_fops,
+			 failure_tracking_get,
+			 failure_tracking_set, "%llu\n");
+
+static int buffer_size_set(void *data, u64 val)
+{
+	unsigned long flags;
+	struct captured_pinner *new, *old;
+
+	if (!val)
+		return -EINVAL;
+
+	/* zeroed so that unused slots keep a NULL stack handle */
+	new = kvcalloc(val, sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pp_buffer.lock, flags);
+	old = pp_buffer.buffer;
+	pp_buffer.buffer = new;
+	pp_buffer.index = 0;
+	pp_buf_size = val;
+	spin_unlock_irqrestore(&pp_buffer.lock, flags);
+	kvfree(old);
+
+	return 0;
+}
+
+static int buffer_size_get(void *data, u64 *val)
+{
+	*val = pp_buf_size;
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(buffer_size_fops,
+			 buffer_size_get,
+			 buffer_size_set, "%llu\n");
+
+static int __init page_pinner_init(void)
+{
+	struct dentry *pp_debugfs_root;
+
+	if (!static_branch_unlikely(&page_pinner_inited))
+		return 0;
+
+	spin_lock_init(&pp_buffer.lock);
+	pp_buffer.index = 0;
+
+	/* zeroed so that unused slots keep a NULL stack handle */
+	pp_buffer.buffer = kvcalloc(pp_buf_size, sizeof(*pp_buffer.buffer),
+				    GFP_KERNEL);
+	if (!pp_buffer.buffer) {
+		pr_info("page_pinner disabled due to failure of buffer allocation\n");
+		return -ENOMEM;
+	}
+
+	pr_info("page_pinner enabled\n");
+
+	pp_debugfs_root = debugfs_create_dir("page_pinner", NULL);
+
+	debugfs_create_file("buffer", 0444, pp_debugfs_root, NULL,
+			    &proc_buffer_operations);
+
+	debugfs_create_file("failure_tracking", 0644, pp_debugfs_root, NULL,
+			    &failure_tracking_fops);
+
+	debugfs_create_file("buffer_size", 0644, pp_debugfs_root, NULL,
+			    &buffer_size_fops);
+	return 0;
+}
+late_initcall(page_pinner_init);