I found a race condition triggering VM_BUG_ON() in freeze_page(), when running a testcase with 3 processes: - process 1: keep writing thp, - process 2: keep clearing soft-dirty bits from virtual address of process 1 - process 3: call migratepages for process 1, The kernel message is like this: kernel BUG at /src/linux-dev/mm/huge_memory.c:3096! invalid opcode: 0000 [#1] SMP Modules linked in: cfg80211 rfkill crc32c_intel ppdev serio_raw pcspkr virtio_balloon virtio_console parport_pc parport pvpanic acpi_cpufreq tpm_tis tpm i2c_piix4 virtio_blk virtio_net ata_generic pata_acpi floppy virtio_pci virtio_ring virtio CPU: 0 PID: 28863 Comm: migratepages Not tainted 4.6.0-v4.6-160602-0827-+ #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880037320000 ti: ffff88007cdd0000 task.ti: ffff88007cdd0000 RIP: 0010:[<ffffffff811f8e06>] [<ffffffff811f8e06>] split_huge_page_to_list+0x496/0x590 RSP: 0018:ffff88007cdd3b70 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff88007c7b88c0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000700000200 RDI: ffffea0003188000 RBP: ffff88007cdd3bb8 R08: 0000000000000001 R09: 00003ffffffff000 R10: ffff880000000000 R11: ffffc000001fffff R12: ffffea0003188000 R13: ffffea0003188000 R14: 0000000000000000 R15: 0400000000000080 FS: 00007f8ec241d740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8ec1f3ed20 CR3: 000000003707b000 CR4: 00000000000006f0 Call Trace: ? list_del+0xd/0x30 queue_pages_pte_range+0x4d1/0x590 __walk_page_range+0x204/0x4e0 walk_page_range+0x71/0xf0 queue_pages_range+0x75/0x90 ? queue_pages_hugetlb+0x190/0x190 ? new_node_page+0xc0/0xc0 ? change_prot_numa+0x40/0x40 migrate_to_node+0x71/0xd0 do_migrate_pages+0x1c3/0x210 SyS_migrate_pages+0x261/0x290 entry_SYSCALL_64_fastpath+0x1a/0xa4 Code: e8 b0 87 fb ff 0f 0b 48 c7 c6 30 32 9f 81 e8 a2 87 fb ff 0f 0b 48 c7 c6 b8 46 9f 81 e8 94 87 fb ff 0f 0b 85 c0 0f 84 3e fd ff ff <0f> 0b 85 c0 0f 85 a6 00 00 00 48 8b 75 c0 4c 89 f7 41 be f0 ff RIP split_huge_page_to_list+0x496/0x590 I'm not sure of the full scenario of the reproduction, but my debug showed that split_huge_pmd_address(freeze=true) returned without running main code of pmd splitting because pmd_present(*pmd) in precheck somehow returned 0. If this happens, the subsequent try_to_unmap() fails and returns non-zero (because page_mapcount() still > 0), and finally VM_BUG_ON() fires. This patch tries to fix it by prechecking pmd state inside ptl. Link: http://lkml.kernel.org/r/1466990929-7452-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
223 lines
6.7 KiB
C
223 lines
6.7 KiB
C
#ifndef _LINUX_HUGE_MM_H
|
|
#define _LINUX_HUGE_MM_H
|
|
|
|
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
unsigned int flags);
|
|
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
|
struct vm_area_struct *vma);
|
|
extern void huge_pmd_set_accessed(struct mm_struct *mm,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
pmd_t orig_pmd, int dirty);
|
|
extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long address, pmd_t *pmd,
|
|
pmd_t orig_pmd);
|
|
extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
|
|
unsigned long addr,
|
|
pmd_t *pmd,
|
|
unsigned int flags);
|
|
extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
|
|
struct vm_area_struct *vma,
|
|
pmd_t *pmd, unsigned long addr, unsigned long next);
|
|
extern int zap_huge_pmd(struct mmu_gather *tlb,
|
|
struct vm_area_struct *vma,
|
|
pmd_t *pmd, unsigned long addr);
|
|
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long addr, unsigned long end,
|
|
unsigned char *vec);
|
|
extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
|
unsigned long new_addr, unsigned long old_end,
|
|
pmd_t *old_pmd, pmd_t *new_pmd);
|
|
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long addr, pgprot_t newprot,
|
|
int prot_numa);
|
|
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
|
pfn_t pfn, bool write);
|
|
enum transparent_hugepage_flag {
|
|
TRANSPARENT_HUGEPAGE_FLAG,
|
|
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
|
|
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
|
|
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
|
|
#ifdef CONFIG_DEBUG_VM
|
|
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
|
|
#endif
|
|
};
|
|
|
|
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
|
|
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
|
|
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
|
|
pmd_t *pmd, int flags);
|
|
|
|
#define HPAGE_PMD_SHIFT PMD_SHIFT
|
|
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
|
|
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
|
|
|
|
extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
|
|
|
|
#define transparent_hugepage_enabled(__vma) \
|
|
((transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
|
|
((__vma)->vm_flags & VM_HUGEPAGE))) && \
|
|
!((__vma)->vm_flags & VM_NOHUGEPAGE) && \
|
|
!is_vma_temporary_stack(__vma))
|
|
#define transparent_hugepage_use_zero_page() \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
|
|
#ifdef CONFIG_DEBUG_VM
|
|
#define transparent_hugepage_debug_cow() \
|
|
(transparent_hugepage_flags & \
|
|
(1<<TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG))
|
|
#else /* CONFIG_DEBUG_VM */
|
|
#define transparent_hugepage_debug_cow() 0
|
|
#endif /* CONFIG_DEBUG_VM */
|
|
|
|
extern unsigned long transparent_hugepage_flags;
|
|
|
|
extern void prep_transhuge_page(struct page *page);
|
|
extern void free_transhuge_page(struct page *page);
|
|
|
|
int split_huge_page_to_list(struct page *page, struct list_head *list);
|
|
static inline int split_huge_page(struct page *page)
|
|
{
|
|
return split_huge_page_to_list(page, NULL);
|
|
}
|
|
void deferred_split_huge_page(struct page *page);
|
|
|
|
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long address, bool freeze, struct page *page);
|
|
|
|
#define split_huge_pmd(__vma, __pmd, __address) \
|
|
do { \
|
|
pmd_t *____pmd = (__pmd); \
|
|
if (pmd_trans_huge(*____pmd) \
|
|
|| pmd_devmap(*____pmd)) \
|
|
__split_huge_pmd(__vma, __pmd, __address, \
|
|
false, NULL); \
|
|
} while (0)
|
|
|
|
|
|
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
|
bool freeze, struct page *page);
|
|
|
|
extern int hugepage_madvise(struct vm_area_struct *vma,
|
|
unsigned long *vm_flags, int advice);
|
|
extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
long adjust_next);
|
|
extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma);
|
|
/* mmap_sem must be held on entry */
|
|
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
|
|
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
|
|
return __pmd_trans_huge_lock(pmd, vma);
|
|
else
|
|
return NULL;
|
|
}
|
|
static inline int hpage_nr_pages(struct page *page)
|
|
{
|
|
if (unlikely(PageTransHuge(page)))
|
|
return HPAGE_PMD_NR;
|
|
return 1;
|
|
}
|
|
|
|
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
|
|
|
|
extern struct page *huge_zero_page;
|
|
|
|
static inline bool is_huge_zero_page(struct page *page)
|
|
{
|
|
return ACCESS_ONCE(huge_zero_page) == page;
|
|
}
|
|
|
|
static inline bool is_huge_zero_pmd(pmd_t pmd)
|
|
{
|
|
return is_huge_zero_page(pmd_page(pmd));
|
|
}
|
|
|
|
struct page *get_huge_zero_page(void);
|
|
void put_huge_zero_page(void);
|
|
|
|
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
|
|
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
|
|
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
|
|
|
|
#define hpage_nr_pages(x) 1
|
|
|
|
#define transparent_hugepage_enabled(__vma) 0
|
|
|
|
#define transparent_hugepage_flags 0UL
|
|
static inline int
|
|
split_huge_page_to_list(struct page *page, struct list_head *list)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int split_huge_page(struct page *page)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void deferred_split_huge_page(struct page *page) {}
|
|
#define split_huge_pmd(__vma, __pmd, __address) \
|
|
do { } while (0)
|
|
|
|
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
|
|
unsigned long address, bool freeze, struct page *page) {}
|
|
|
|
static inline int hugepage_madvise(struct vm_area_struct *vma,
|
|
unsigned long *vm_flags, int advice)
|
|
{
|
|
BUG();
|
|
return 0;
|
|
}
|
|
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
long adjust_next)
|
|
{
|
|
}
|
|
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline bool is_huge_zero_page(struct page *page)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline void put_huge_zero_page(void)
|
|
{
|
|
BUILD_BUG();
|
|
}
|
|
|
|
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
|
|
unsigned long addr, pmd_t *pmd, int flags)
|
|
{
|
|
return NULL;
|
|
}
|
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
|
|
|
#endif /* _LINUX_HUGE_MM_H */
|