android_kernel_xiaomi_sm8450/mm/mprotect.c
Peter Collingbourne b44e46bb04 FROMGIT: mm: improve mprotect(R|W) efficiency on pages referenced once
In the Scudo memory allocator [1] we would like to be able to detect
use-after-free vulnerabilities involving large allocations by issuing
mprotect(PROT_NONE) on the memory region used for the allocation when it
is deallocated.  Later on, after the memory region has been "quarantined"
for a sufficient period of time we would like to be able to use it for
another allocation by issuing mprotect(PROT_READ|PROT_WRITE).

Before this patch, after removing the write protection, any writes to the
memory region would result in page faults and entering the copy-on-write
code path, even in the usual case where the pages are only referenced by a
single PTE, harming performance unnecessarily.  Make it so that any pages
in anonymous mappings that are only referenced by a single PTE are
immediately made writable during the mprotect so that we can avoid the
page faults.

This program shows the critical syscall sequence that we intend to use in
the allocator:

  #include <string.h>
  #include <sys/mman.h>

  enum { kSize = 131072 };

  int main(int argc, char **argv) {
    char *addr = (char *)mmap(0, kSize, PROT_READ | PROT_WRITE,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    for (int i = 0; i != 100000; ++i) {
      memset(addr, i, kSize);
      mprotect((void *)addr, kSize, PROT_NONE);
      mprotect((void *)addr, kSize, PROT_READ | PROT_WRITE);
    }
  }

The effect of this patch on the above program was measured on a
DragonBoard 845c by taking the median real time execution time of 10 runs.

Before: 2.94s
After:  0.66s

The effect was also measured using one of the microbenchmarks that we
normally use to benchmark the allocator [2], after modifying it to make
the appropriate mprotect calls [3].  With an allocation size of 131072
bytes to trigger the allocator's "large allocation" code path the
per-iteration time was measured as follows:

Before: 27450ns
After:   6010ns

This patch means that we do more work during the mprotect call itself in
exchange for less work when the pages are accessed.  In the worst case,
the pages are not accessed at all.  The effect of this patch in such cases
was measured using the following program:

  #include <string.h>
  #include <sys/mman.h>

  enum { kSize = 131072 };

  int main(int argc, char **argv) {
    char *addr = (char *)mmap(0, kSize, PROT_READ | PROT_WRITE,
                              MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    memset(addr, 1, kSize);
    for (int i = 0; i != 100000; ++i) {
  #ifdef PAGE_FAULT
      memset(addr + (i * 4096) % kSize, i, 4096);
  #endif
      mprotect((void *)addr, kSize, PROT_NONE);
      mprotect((void *)addr, kSize, PROT_READ | PROT_WRITE);
    }
  }

With PAGE_FAULT undefined (0 pages touched after removing write
protection) the median real time execution time of 100 runs was measured
as follows:

Before: 0.330260s
After:  0.338836s

With PAGE_FAULT defined (1 page touched) the measurements were
as follows:

Before: 0.438048s
After:  0.355661s

So it seems that even with a single page fault the new approach is faster.

I saw similar results if I adjusted the programs to use a larger mapping
size.  With kSize = 1048576 I get these numbers with PAGE_FAULT undefined:

Before: 1.428988s
After:  1.512016s

i.e. around 5.5%.

And these with PAGE_FAULT defined:

Before: 1.518559s
After:  1.524417s

i.e. about the same.

What I think we may conclude from these results is that for smaller
mappings the advantage of the previous approach, although measurable, is
wiped out by a single page fault.  I think we may expect that there should
be at least one access resulting in a page fault (under the previous
approach) after making the pages writable, since the program presumably
made the pages writable for a reason.

For larger mappings we may guesstimate that the new approach wins if the
density of future page faults is > 0.4%.  But for the mappings that are
large enough for density to matter (not just the absolute number of page
faults) it doesn't seem like the increase in mprotect latency would be
very large relative to the total mprotect execution time.

Link: https://lkml.kernel.org/r/20210527190453.1259020-1-pcc@google.com
Link: https://linux-review.googlesource.com/id/I98d75ef90e20330c578871c87494d64b1df3f1b8
Link: [1] https://source.android.com/devices/tech/debug/scudo
Link: [2] https://cs.android.com/android/platform/superproject/+/master:bionic/benchmarks/stdlib_benchmark.cpp;l=53;drc=e8693e78711e8f45ccd2b610e4dbe0b94d551cc9
Link: [3] https://github.com/pcc/llvm-project/commit/scudo-mprotect-secondary2
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Kostya Kortchinsky <kostyak@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
(cherry picked from commit e2037f9c0c61ed6964bb1291292ae88f073a100c
 https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git akpm)
[pcc: squashed v4->v5 diff which appeared as a separate commit: ec7563ea9f6a470e9bb532b024ce29d9474daf24]
Change-Id: Ic3994b2ec914d3f62f95c1ef338986e350e69e36
Bug: 191165850
2021-06-15 19:33:15 +00:00

743 lines
18 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* mm/mprotect.c
*
* (C) Copyright 1994 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include "internal.h"
/* Determine whether we can avoid taking write faults for known dirty pages. */
static bool may_avoid_write_fault(pte_t pte, struct vm_area_struct *vma,
unsigned long cp_flags)
{
/*
* The dirty accountable bit indicates that we can always make the page
* writable regardless of the number of references.
*/
if (!(cp_flags & MM_CP_DIRTY_ACCT)) {
/* Otherwise, we must have exclusive access to the page. */
if (!(vma_is_anonymous(vma) && (vma->vm_flags & VM_WRITE)))
return false;
if (page_count(pte_page(pte)) != 1)
return false;
}
/*
* Don't do this optimization for clean pages as we need to be notified
* of the transition from clean to dirty.
*/
if (!pte_dirty(pte))
return false;
/* Same for softdirty. */
if (!pte_soft_dirty(pte) && (vma->vm_flags & VM_SOFTDIRTY))
return false;
/*
* For userfaultfd the user program needs to monitor write faults so we
* can't do this optimization.
*/
if (pte_uffd_wp(pte))
return false;
/*
* It is unclear whether this optimization can be done safely for NUMA
* pages.
*/
if (cp_flags & MM_CP_PROT_NUMA)
return false;
return true;
}
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* Can be called with only the mmap_lock for reading by
* prot_numa so we must check the pmd isn't constantly
* changing from under us from pmd_none to pmd_trans_huge
* and/or the other way around.
*/
if (pmd_trans_unstable(pmd))
return 0;
/*
* The pmd points to a regular pte so the pmd can't change
* from under us even if the mmap_lock is only hold for
* reading.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
if (prot_numa) {
struct page *page;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
page = vm_normal_page(vma, addr, oldpte);
if (!page || PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
page_mapcount(page) != 1)
continue;
/*
* While migration can move some dirty pages,
* it cannot move them all from MIGRATE_ASYNC
* context.
*/
if (page_is_file_lru(page) && PageDirty(page))
continue;
/*
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
if (target_node == page_to_nid(page))
continue;
}
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
if (uffd_wp) {
ptent = pte_wrprotect(ptent);
ptent = pte_mkuffd_wp(ptent);
} else if (uffd_wp_resolve) {
/*
* Leave the write bit to be handled
* by PF interrupt handler, then
* things like COW could be properly
* handled.
*/
ptent = pte_clear_uffd_wp(ptent);
}
if (may_avoid_write_fault(ptent, vma, cp_flags))
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
if (is_write_migration_entry(entry)) {
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_write_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
} else {
newpte = oldpte;
}
if (uffd_wp)
newpte = pte_swp_mkuffd_wp(newpte);
else if (uffd_wp_resolve)
newpte = pte_swp_clear_uffd_wp(newpte);
if (!pte_same(oldpte, newpte)) {
set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return pages;
}
/*
* Used when setting automatic NUMA hinting protection where it is
* critical that a numa hinting PMD is not confused with a bad PMD.
*/
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
pmd_t pmdval = pmd_read_atomic(pmd);
/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
barrier();
#endif
if (pmd_none(pmdval))
return 1;
if (pmd_trans_huge(pmdval))
return 0;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
return 1;
}
return 0;
}
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
range.start = 0;
pmd = pmd_offset(pud, addr);
do {
unsigned long this_pages;
next = pmd_addr_end(addr, end);
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
* between pmd_trans_huge() and a pmd_none_or_clear_bad()
* check leading to a false positive and clearing.
* Hence, it's necessary to atomically read the PMD value
* for all the checks.
*/
if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
pmd_none_or_clear_bad_unless_trans_huge(pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
mmu_notifier_range_init(&range,
MMU_NOTIFY_PROTECTION_VMA, 0,
vma, vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, cp_flags);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
/* huge pmd was handled */
goto next;
}
}
/* fall through, the trans huge pmd just split */
}
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
cp_flags);
pages += this_pages;
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
if (range.start)
mmu_notifier_invalidate_range_end(&range);
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
p4d_t *p4d, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
unsigned long pages = 0;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(vma, pud, addr, next, newprot,
cp_flags);
} while (pud++, addr = next, addr != end);
return pages;
}
static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
pgd_t *pgd, unsigned long addr, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
unsigned long pages = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(vma, p4d, addr, next, newprot,
cp_flags);
} while (p4d++, addr = next, addr != end);
return pages;
}
static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(vma, pgd, addr, next, newprot,
cp_flags);
} while (pgd++, addr = next, addr != end);
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
dec_tlb_flush_pending(mm);
return pages;
}
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
unsigned long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot,
cp_flags);
return pages;
}
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
static int prot_none_test(unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
return 0;
}
static const struct mm_walk_ops prot_none_walk_ops = {
.pte_entry = prot_none_pte_entry,
.hugetlb_entry = prot_none_hugetlb_entry,
.test_walk = prot_none_test,
};
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
pgoff_t pgoff;
int error;
int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
return 0;
}
/*
* Do PROT_NONE PFN permission checks here when we can still
* bail out without undoing a lot of state. This is a rather
* uncommon case, so doesn't need to be very optimized.
*/
if (arch_has_pfn_modify_check() &&
(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
error = walk_page_range(current->mm, start, end,
&prot_none_walk_ops, &new_pgprot);
if (error)
return error;
}
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. hugetlb mapping were accounted for
* even if read-only so there is no need to account for them here
*/
if (newflags & VM_WRITE) {
/* Check space limits when area turns into data. */
if (!may_expand_vm(mm, newflags, nrpages) &&
may_expand_vm(mm, oldflags, nrpages))
return -ENOMEM;
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
}
/*
* First try to merge with previous and/or next vma.
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
goto success;
}
*pprev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
goto fail;
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
goto fail;
}
success:
/*
* vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, newflags);
dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
vm_write_end(vma);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
* fault on access.
*/
if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
(newflags & VM_WRITE)) {
populate_vma_page_range(vma, start, end, NULL);
}
vm_stat_account(mm, oldflags, -nrpages);
vm_stat_account(mm, newflags, nrpages);
perf_event_mmap(vma);
return 0;
fail:
vm_unacct_memory(charged);
return error;
}
/*
* pkey==-1 when doing a legacy mprotect()
*/
static int do_mprotect_pkey(unsigned long start, size_t len,
unsigned long prot, int pkey)
{
unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
start = untagged_addr(start);
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
if (start & ~PAGE_MASK)
return -EINVAL;
if (!len)
return 0;
len = PAGE_ALIGN(len);
end = start + len;
if (end <= start)
return -ENOMEM;
if (!arch_validate_prot(prot, start))
return -EINVAL;
reqprot = prot;
if (mmap_write_lock_killable(current->mm))
return -EINTR;
/*
* If userspace did not allocate the pkey, do not let
* them use it here.
*/
error = -EINVAL;
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
vma = find_vma(current->mm, start);
error = -ENOMEM;
if (!vma)
goto out;
prev = vma->vm_prev;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
start = vma->vm_start;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
} else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
end = vma->vm_end;
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSUP))
goto out;
}
}
if (start > vma->vm_start)
prev = vma;
for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
unsigned long newflags;
int new_vma_pkey;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
/* Does the application expect PROT_READ to imply PROT_EXEC */
if (rier && (vma->vm_flags & VM_MAYEXEC))
prot |= PROT_EXEC;
/*
* Each mprotect() call explicitly passes r/w/x permissions.
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
VM_FLAGS_CLEAR;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
newflags |= (vma->vm_flags & ~mask_off_old_flags);
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
error = -EACCES;
goto out;
}
/* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) {
error = -EINVAL;
goto out;
}
error = security_file_mprotect(vma, reqprot, prot);
if (error)
goto out;
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
nstart = tmp;
if (nstart < prev->vm_end)
nstart = prev->vm_end;
if (nstart >= end)
goto out;
vma = prev->vm_next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
goto out;
}
prot = reqprot;
}
out:
mmap_write_unlock(current->mm);
return error;
}
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
return do_mprotect_pkey(start, len, prot, -1);
}
#ifdef CONFIG_ARCH_HAS_PKEYS
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
unsigned long, prot, int, pkey)
{
return do_mprotect_pkey(start, len, prot, pkey);
}
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
int pkey;
int ret;
/* No flags supported yet. */
if (flags)
return -EINVAL;
/* check for unsupported init values */
if (init_val & ~PKEY_ACCESS_MASK)
return -EINVAL;
mmap_write_lock(current->mm);
pkey = mm_pkey_alloc(current->mm);
ret = -ENOSPC;
if (pkey == -1)
goto out;
ret = arch_set_user_pkey_access(current, pkey, init_val);
if (ret) {
mm_pkey_free(current->mm, pkey);
goto out;
}
ret = pkey;
out:
mmap_write_unlock(current->mm);
return ret;
}
SYSCALL_DEFINE1(pkey_free, int, pkey)
{
int ret;
mmap_write_lock(current->mm);
ret = mm_pkey_free(current->mm, pkey);
mmap_write_unlock(current->mm);
/*
* We could provie warnings or errors if any VMA still
* has the pkey set here.
*/
return ret;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */