diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 72f2b974b6d1..d9c35eb3066a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -289,6 +289,12 @@
 			do not want to use tracing_snapshot_alloc() as it needs
 			to be done where GFP_KERNEL allocations are allowed.
 
+	allow_file_spec_access
+			Allow speculative faults on file-backed pages.
+			Speculative faults are enabled only for VMAs whose
+			vm_ops implement the allow_speculation callback and
+			return true from it.
+
 	allow_mismatched_32bit_el0 [ARM64]
 			Allow execve() of 32-bit applications and setting of the
 			PER_LINUX32 personality on systems where only a strict
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0480e07c60b2..05c9813d6b22 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -745,6 +745,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= ext4_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= ext4_page_mkwrite,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.allow_speculation = filemap_allow_speculation,
+#endif
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 471a6ff0c937..453bd12574c7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -171,6 +171,9 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= f2fs_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.allow_speculation = filemap_allow_speculation,
+#endif
 };
 
 static int get_parent_ino(struct inode *inode, nid_t *pino)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5093bc2beac8..f9d07145d1af 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -647,6 +647,10 @@ struct vm_operations_struct {
 	struct page *(*find_special_page)(struct vm_area_struct *vma,
 					  unsigned long addr);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	bool (*allow_speculation)(void);
+#endif
+
 	ANDROID_KABI_RESERVE(1);
 	ANDROID_KABI_RESERVE(2);
 	ANDROID_KABI_RESERVE(3);
@@ -2775,6 +2779,9 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		pgoff_t start_pgoff, pgoff_t end_pgoff);
 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern bool filemap_allow_speculation(void);
+#endif
 
 /* mm/page-writeback.c */
 int __must_check write_one_page(struct page *page);
@@ -3339,6 +3346,7 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 #endif
 
 extern int sysctl_nr_trim_pages;
+extern bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 0e73f8a955a8..ca7fe947e187 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -126,7 +126,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		SWAP_RA_HIT,
 #endif
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-		SPECULATIVE_PGFAULT,
+		SPECULATIVE_PGFAULT_ANON,
+		SPECULATIVE_PGFAULT_FILE,
 #endif
 		NR_VM_EVENT_ITEMS
 };
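The opt-in above is deliberately uniform: a filesystem advertises support by pointing the new vm_ops callback at filemap_allow_speculation(), which is only sound when its .fault/.map_pages paths are the generic filemap helpers patched below. As a sketch of the pattern (foofs is hypothetical, not part of this series):

	/* Hypothetical filesystem mirroring the ext4/f2fs hunks above. */
	static const struct vm_operations_struct foofs_file_vm_ops = {
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,
		.page_mkwrite	= filemap_page_mkwrite,
	#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
		/* Safe only because the ops above are the filemap helpers. */
		.allow_speculation = filemap_allow_speculation,
	#endif
	};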
diff --git a/mm/filemap.c b/mm/filemap.c
index 91cc37ae2d8c..ae831c96b5fc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2864,6 +2864,11 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
 	}
 
 	if (pmd_none(*vmf->pmd)) {
+		if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
+			unlock_page(page);
+			put_page(page);
+			return true;
+		}
 		vmf->ptl = pmd_lock(mm, vmf->pmd);
 		if (likely(pmd_none(*vmf->pmd))) {
 			mm_inc_nr_ptes(mm);
@@ -2942,6 +2947,14 @@ static inline struct page *next_map_page(struct address_space *mapping,
 					  mapping, xas, end_pgoff);
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+bool filemap_allow_speculation(void)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(filemap_allow_speculation);
+#endif
+
 vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 			     pgoff_t start_pgoff, pgoff_t end_pgoff)
 {
@@ -2961,12 +2974,22 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		goto out;
 
 	if (filemap_map_pmd(vmf, head)) {
+		if (pmd_none(*vmf->pmd) &&
+		    vmf->flags & FAULT_FLAG_SPECULATIVE) {
+			ret = VM_FAULT_RETRY;
+			goto out;
+		}
+
 		ret = VM_FAULT_NOPAGE;
 		goto out;
 	}
 
 	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+	if (!pte_map_lock_addr(vmf, addr)) {
+		ret = VM_FAULT_RETRY;
+		goto out;
+	}
+
 	do {
 		page = find_subpage(head, xas.xa_index);
 		if (PageHWPoison(page))
@@ -3033,6 +3056,9 @@ const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= filemap_page_mkwrite,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.allow_speculation = filemap_allow_speculation,
+#endif
 };
 
 /* This is used for a general mmap of a disk file */
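The filemap changes above share one rule: a speculative walker must never sleep or allocate page tables, so every blocking step becomes a trylock-and-bail that the caller reports as VM_FAULT_RETRY. Reduced to a self-contained sketch (names here are illustrative; __pte_map_lock_speculative() in mm/memory.c below is the real implementation):

	/* Validate, trylock, revalidate: contention means "retry later". */
	static bool spec_lock_sketch(spinlock_t *ptl, bool (*changed)(void))
	{
		if (changed())			/* did the VMA/pmd move under us? */
			return false;
		if (!spin_trylock(ptl))		/* never block while speculating */
			return false;
		if (changed()) {		/* recheck now that we hold the lock */
			spin_unlock(ptl);
			return false;
		}
		return true;			/* caller proceeds, then unlocks */
	}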
diff --git a/mm/memory.c b/mm/memory.c
index 2fe2bf2962dd..46ed24f10475 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2638,7 +2638,7 @@ static bool pte_spinlock(struct vm_fault *vmf)
 	return ret;
 }
 
-static bool pte_map_lock(struct vm_fault *vmf)
+static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr)
 {
 	bool ret = false;
 	pte_t *pte;
@@ -2647,12 +2647,6 @@
 	pmd_t pmdval;
 #endif
 
-	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
-		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
-					       vmf->address, &vmf->ptl);
-		return true;
-	}
-
 	/*
 	 * The first vma_has_changed() guarantees the page-tables are still
 	 * valid, having IRQs disabled ensures they stay around, hence the
@@ -2662,7 +2656,7 @@
 	 */
 	local_irq_disable();
 	if (vma_has_changed(vmf)) {
-		trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+		trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
 		goto out;
 	}
 
@@ -2673,7 +2667,7 @@
 	 */
 	pmdval = READ_ONCE(*vmf->pmd);
 	if (!pmd_same(pmdval, vmf->orig_pmd)) {
-		trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+		trace_spf_pmd_changed(_RET_IP_, vmf->vma, addr);
 		goto out;
 	}
 #endif
@@ -2686,16 +2680,16 @@
 	 * Since we are in a speculative patch, accept it could fail
 	 */
 	ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-	pte = pte_offset_map(vmf->pmd, vmf->address);
+	pte = pte_offset_map(vmf->pmd, addr);
 	if (unlikely(!spin_trylock(ptl))) {
 		pte_unmap(pte);
-		trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address);
+		trace_spf_pte_lock(_RET_IP_, vmf->vma, addr);
 		goto out;
 	}
 
 	if (vma_has_changed(vmf)) {
 		pte_unmap_unlock(pte, ptl);
-		trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+		trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);
 		goto out;
 	}
 
@@ -2706,6 +2700,82 @@
 	local_irq_enable();
 	return ret;
 }
+
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+					       vmf->address, &vmf->ptl);
+		return true;
+	}
+
+	return __pte_map_lock_speculative(vmf, vmf->address);
+}
+
+bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
+{
+	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+					       addr, &vmf->ptl);
+		return true;
+	}
+
+	return __pte_map_lock_speculative(vmf, addr);
+}
+
+static bool __read_mostly allow_file_spec_access;
+static int __init allow_file_spec_access_setup(char *str)
+{
+	allow_file_spec_access = true;
+	return 1;
+}
+__setup("allow_file_spec_access", allow_file_spec_access_setup);
+
+static bool vmf_allows_speculation(struct vm_fault *vmf)
+{
+	if (vma_is_anonymous(vmf->vma)) {
+		/*
+		 * __anon_vma_prepare() requires the mmap_sem to be held
+		 * because vm_next and vm_prev must be safe. This can't be
+		 * guaranteed in the speculative path.
+		 */
+		if (!vmf->vma->anon_vma) {
+			trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
+			return false;
+		}
+		return true;
+	}
+
+	if (!allow_file_spec_access) {
+		/*
+		 * Can't call the vm_ops callbacks, as we don't know
+		 * what they would do with the VMA. This includes huge
+		 * pages from hugetlbfs.
+		 */
+		trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
+		return false;
+	}
+
+	if (!(vmf->vma->vm_flags & VM_SHARED) &&
+	    (vmf->flags & FAULT_FLAG_WRITE) &&
+	    !vmf->vma->anon_vma) {
+		/*
+		 * non-anonymous private COW without anon_vma.
+		 * See above.
+		 */
+		trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
+		return false;
+	}
+
+	if (vmf->vma->vm_ops->allow_speculation &&
+	    vmf->vma->vm_ops->allow_speculation()) {
+		return true;
+	}
+
+	trace_spf_vma_notsup(_RET_IP_, vmf->vma, vmf->address);
+	return false;
+}
+
 #else
 static inline bool pte_spinlock(struct vm_fault *vmf)
 {
@@ -2720,6 +2790,18 @@ static inline bool pte_map_lock(struct vm_fault *vmf)
 					       vmf->address, &vmf->ptl);
 	return true;
 }
+
+inline bool pte_map_lock_addr(struct vm_fault *vmf, unsigned long addr)
+{
+	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+				       addr, &vmf->ptl);
+	return true;
+}
+
+static inline bool vmf_allows_speculation(struct vm_fault *vmf)
+{
+	return false;
+}
 #endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
 
 /*
@@ -4496,6 +4578,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
 	pte_t entry;
+	vm_fault_t ret = 0;
 
 	if (unlikely(pmd_none(*vmf->pmd))) {
 		/*
@@ -4559,7 +4642,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (!vmf->pte) {
 		if (vma_is_anonymous(vmf->vma))
 			return do_anonymous_page(vmf);
-		else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
+		else if ((vmf->flags & FAULT_FLAG_SPECULATIVE) &&
+			 !vmf_allows_speculation(vmf))
 			return VM_FAULT_RETRY;
 		else
 			return do_fault(vmf);
@@ -4591,6 +4675,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		/* Skip spurious TLB flush for retried page fault */
 		if (vmf->flags & FAULT_FLAG_TRIED)
 			goto unlock;
+		if (vmf->flags & FAULT_FLAG_SPECULATIVE)
+			ret = VM_FAULT_RETRY;
 		/*
 		 * This is needed only for protection faults but the arch code
 		 * is not yet telling us if this is a protection fault or not.
@@ -4602,7 +4688,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	return ret;
 }
 
 /*
@@ -4794,6 +4880,7 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
 		.pgoff = linear_page_index(vma, address),
 		.vma = vma,
 		.gfp_mask = __get_fault_gfp_mask(vma),
+		.flags = flags,
 	};
 #ifdef CONFIG_NUMA
 	struct mempolicy *pol;
@@ -4815,25 +4902,8 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
 		return VM_FAULT_RETRY;
 	}
 
-	/*
-	 * Can't call vm_ops service has we don't know what they would do
-	 * with the VMA.
-	 * This include huge page from hugetlbfs.
-	 */
-	if (vmf.vma->vm_ops) {
-		trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
+	if (!vmf_allows_speculation(&vmf))
 		return VM_FAULT_RETRY;
-	}
-
-	/*
-	 * __anon_vma_prepare() requires the mmap_sem to be held
-	 * because vm_next and vm_prev must be safe. This can't be guaranteed
-	 * in the speculative path.
-	 */
-	if (unlikely(!vmf.vma->anon_vma)) {
-		trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
-		return VM_FAULT_RETRY;
-	}
 
 	vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
 	vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
@@ -4964,8 +5034,12 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
 	ret = handle_pte_fault(&vmf);
 	mem_cgroup_exit_user_fault();
 
-	if (ret != VM_FAULT_RETRY)
-		count_vm_event(SPECULATIVE_PGFAULT);
+	if (ret != VM_FAULT_RETRY) {
+		if (vma_is_anonymous(vmf.vma))
+			count_vm_event(SPECULATIVE_PGFAULT_ANON);
+		else
+			count_vm_event(SPECULATIVE_PGFAULT_FILE);
+	}
 
 	/*
 	 * The task may have entered a memcg OOM situation but
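Everything in mm/memory.c funnels failure into VM_FAULT_RETRY, which is the contract the arch fault handlers rely on: try the speculative path first, and only take mmap_lock when it bails. A simplified sketch of that caller (the handle_speculative_fault() signature is assumed from this tree's mm.h and may differ; error handling is abbreviated):

	static vm_fault_t fault_path_sketch(struct mm_struct *mm, unsigned long addr,
					    unsigned int flags, struct pt_regs *regs)
	{
		struct vm_area_struct *vma = NULL;
		vm_fault_t fault;

		/* Lockless attempt; must not sleep or call vm_ops blindly. */
		fault = handle_speculative_fault(mm, addr, flags, &vma);
		if (fault != VM_FAULT_RETRY)
			return fault;

		/* Classic slow path under mmap_lock. */
		mmap_read_lock(mm);
		vma = find_vma(mm, addr);
		if (!vma || vma->vm_start > addr) {
			mmap_read_unlock(mm);
			return VM_FAULT_SIGSEGV;
		}
		fault = handle_mm_fault(vma, addr, flags, regs);
		mmap_read_unlock(mm);
		return fault;
	}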
diff --git a/mm/nommu.c b/mm/nommu.c
index ae8f15b106f8..f8f7f0e9947b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1676,6 +1676,14 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+bool filemap_allow_speculation(void)
+{
+	BUG();
+	return false;
+}
+#endif
+
 int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long addr, void *buf, int len, unsigned int gup_flags)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index d80be19d9688..9774c91b1531 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3955,6 +3955,9 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.allow_speculation = filemap_allow_speculation,
+#endif
 };
 
 int shmem_init_fs_context(struct fs_context *fc)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 932928653b21..80c1e0a0f094 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1353,7 +1353,8 @@ const char * const vmstat_text[] = {
 	"swap_ra_hit",
 #endif
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-	"speculative_pgfault"
+	"speculative_pgfault",
+	"speculative_pgfault_file"
 #endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
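With the counter split, /proc/vmstat reports anonymous and file-backed speculative faults separately (the anonymous counter keeps the old "speculative_pgfault" name, so existing tooling still matches). A minimal userspace check:

	/* Print the speculative-fault counters from /proc/vmstat. */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "speculative_pgfault", 19))
				fputs(line, stdout);	/* anon and _file rows */
		fclose(f);
		return 0;
	}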