b6577b7604
The padding VMA is never inserted into the VMA tree; therefore we don't
need to hold the mmap lock in exclusive mode to modify it.

Test: v2/android-gki/ack_platform_integration_main_cf_arm64_boot_test on kernel_virt_debug_aarch64
Bug: 346741763
Bug: 352463814
Change-Id: I4ca3ed22dab45b6bb895cb41c5c6792344188b61
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
(cherry picked from commit a8b3ebe7f90afe0ad0d9a8b0b90f266da06a7007)

// SPDX-License-Identifier: GPL-2.0
/*
 * Page Size Migration
 *
 * This file contains the core logic of mitigations to ensure
 * app compatibility during the transition from 4kB to 16kB
 * page size in Android.
 *
 * Copyright (c) 2024, Google LLC.
 * Author: Kalesh Singh <kaleshsingh@google.com>
 */

#include <linux/pgsize_migration.h>

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/kstrtox.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>

typedef void (*show_pad_maps_fn)(struct seq_file *m, struct vm_area_struct *vma);
typedef int (*show_pad_smaps_fn)(struct seq_file *m, void *v);

#ifdef CONFIG_64BIT
#if PAGE_SIZE == SZ_4K
DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_likely(&pgsize_migration_enabled))
#else /* PAGE_SIZE != SZ_4K */
DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled);

#define is_pgsize_migration_enabled() (static_branch_unlikely(&pgsize_migration_enabled))
#endif /* PAGE_SIZE == SZ_4K */
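
/*
 * The static key mirrors the kernel's page size: the mitigations default to
 * enabled on 4kB kernels and to disabled on 16kB kernels. With the key in
 * place, is_pgsize_migration_enabled() compiles down to a patched branch, so
 * the checks sprinkled through the hot paths below are effectively free.
 */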

static ssize_t show_pgsize_migration_enabled(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	if (is_pgsize_migration_enabled())
		return sprintf(buf, "%d\n", 1);
	else
		return sprintf(buf, "%d\n", 0);
}

static ssize_t store_pgsize_migration_enabled(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t n)
{
	unsigned long val;

	/* Migration is only applicable to 4kB kernels */
	if (PAGE_SIZE != SZ_4K)
		return n;

	if (kstrtoul(buf, 10, &val))
		return -EINVAL;

	if (val > 1)
		return -EINVAL;

	if (val == 1)
		static_branch_enable(&pgsize_migration_enabled);
	else if (val == 0)
		static_branch_disable(&pgsize_migration_enabled);

	return n;
}

static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR(
	enabled,
	0644,
	show_pgsize_migration_enabled,
	store_pgsize_migration_enabled
);

static struct attribute *pgsize_migration_attrs[] = {
	&pgsize_migration_enabled_attr.attr,
	NULL
};

static struct attribute_group pgsize_migration_attr_group = {
	.name = "pgsize_migration",
	.attrs = pgsize_migration_attrs,
};

/**
 * What:          /sys/kernel/mm/pgsize_migration/enabled
 * Date:          April 2024
 * KernelVersion: v5.4+ (GKI kernels)
 * Contact:       Kalesh Singh <kaleshsingh@google.com>
 * Description:   /sys/kernel/mm/pgsize_migration/enabled
 *                allows userspace to turn on or off the page size
 *                migration mitigations necessary for app compatibility
 *                during Android's transition from 4kB to 16kB page size.
 *                Such mitigations include preserving /proc/<pid>/[s]maps
 *                output as if there were no segment extension by the
 *                dynamic loader, and preventing fault-around in the
 *                padding sections of ELF LOAD segment mappings.
 * Users:         Bionic's dynamic linker
 */
static int __init init_pgsize_migration(void)
{
	if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group))
		pr_err("pgsize_migration: failed to create sysfs group\n");

	return 0;
}
late_initcall(init_pgsize_migration);
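
/*
 * Example (illustrative): once this initcall has run, the mitigations can be
 * toggled from userspace via the sysfs knob documented above, e.g.:
 *
 *	echo 0 > /sys/kernel/mm/pgsize_migration/enabled	(disable)
 *	echo 1 > /sys/kernel/mm/pgsize_migration/enabled	(enable)
 *
 * Reads of the same file report the current state as "0" or "1".
 */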

#if PAGE_SIZE == SZ_4K
void vma_set_pad_pages(struct vm_area_struct *vma,
		       unsigned long nr_pages)
{
	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * Usually we would need to take the mmap_lock in exclusive (write)
	 * mode to modify vm_flags, but here we only hold it in read mode, to
	 * avoid all DONTNEED/DONTNEED_LOCKED calls needing the write lock.
	 *
	 * A race on the flags update can only happen with another
	 * MADV_DONTNEED on the same process and same range (VMA).
	 *
	 * In practice, this specific scenario is not possible because the
	 * action that could cause it is usually performed at most once per
	 * VMA and only by the dynamic linker.
	 *
	 * Forgo protection for this case, to avoid penalties in the common
	 * cases.
	 */
	__vm_flags_mod(vma, 0, VM_PAD_MASK);
	__vm_flags_mod(vma, nr_pages << VM_PAD_SHIFT, 0);
}
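
/*
 * Illustrative note: the padding page count lives in the otherwise unused
 * high bits of vm_flags (VM_PAD_MASK, starting at bit VM_PAD_SHIFT). The
 * pair of __vm_flags_mod() calls above first clears any stale count, then
 * stores the new one; e.g. nr_pages == 3 leaves the value 3 in the field,
 * which vma_pad_pages() below reads back by shifting it down.
 */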

unsigned long vma_pad_pages(struct vm_area_struct *vma)
{
	if (!is_pgsize_migration_enabled())
		return 0;

	return vma->vm_flags >> VM_PAD_SHIFT;
}

static __always_inline bool str_has_suffix(const char *str, const char *suffix)
{
	size_t str_len = strlen(str);
	size_t suffix_len = strlen(suffix);

	if (str_len < suffix_len)
		return false;

	return !strncmp(str + str_len - suffix_len, suffix, suffix_len);
}
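
/*
 * E.g. str_has_suffix("libfoo.so", ".so") is true, while
 * str_has_suffix("app_process64", ".so") is false; this is used below to
 * limit padding detection to shared libraries.
 */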

/*
 * The dynamic linker, or interpreter, operates within the process context
 * of the binary that necessitated dynamic linking.
 *
 * Consequently, process context identifiers (like PID, comm, ...) cannot
 * be used to differentiate whether the execution context belongs to the
 * dynamic linker or not.
 *
 * linker_ctx() deduces whether execution is currently in the dynamic
 * linker's context by correlating the current userspace instruction pointer
 * with the VMAs of the current task.
 *
 * Returns true if in linker context, otherwise false.
 *
 * Caller must hold mmap lock in read mode.
 */
static inline bool linker_ctx(void)
{
	struct pt_regs *regs = task_pt_regs(current);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct file *file;

	if (!regs)
		return false;

	vma = find_vma(mm, instruction_pointer(regs));

	/* Current execution context, the VMA must be present */
	BUG_ON(!vma);

	file = vma->vm_file;
	if (!file)
		return false;

	if (vma->vm_flags & VM_EXEC) {
		char buf[64];
		const int bufsize = sizeof(buf);
		char *path;

		memset(buf, 0, bufsize);
		path = d_path(&file->f_path, buf, bufsize);

		/* d_path() returns an ERR_PTR if the path doesn't fit in buf */
		if (IS_ERR(path))
			return false;

		/*
		 * Depending on the interpreter requested, valid paths could be
		 * any of:
		 * 1. /system/bin/bootstrap/linker64
		 * 2. /system/bin/linker64
		 * 3. /apex/com.android.runtime/bin/linker64
		 *
		 * Check the base name (linker64).
		 */
		if (!strcmp(kbasename(path), "linker64"))
			return true;
	}

	return false;
}

/*
 * Saves the number of padding pages for an ELF segment mapping in vm_flags.
 *
 * The number of padding pages is deduced from the madvise DONTNEED range
 * [start, end) if the following conditions are met:
 * 1) The range is enclosed by a single VMA.
 * 2) The range ends at the end address of the VMA.
 * 3) The range starts at an address greater than the start address of the VMA.
 * 4) The number of pages in the range does not exceed VM_TOTAL_PAD_PAGES.
 * 5) The VMA is a file-backed VMA.
 * 6) The file backing the VMA is a shared library (*.so).
 * 7) The madvise was requested by bionic's dynamic linker.
 */
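/*
 * Illustrative walk-through (assuming a 16kB-aligned ELF on a 4kB kernel):
 * the dynamic linker maps a LOAD segment extended to the 16kB boundary and
 * then calls madvise(MADV_DONTNEED) on the trailing sub-16kB tail. That
 * range satisfies conditions 1-7 above, so e.g. a 12kB tail is recorded as
 * 3 padding pages via vma_set_pad_pages().
 */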

void madvise_vma_pad_pages(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	unsigned long nr_pad_pages;

	if (!is_pgsize_migration_enabled())
		return;

	/*
	 * If the madvise range is at the end of the VMA, save the number of
	 * pages in vm_flags (only 4 bits are needed for up to 64kB aligned
	 * ELFs).
	 */
	if (start <= vma->vm_start || end != vma->vm_end)
		return;

	nr_pad_pages = (end - start) >> PAGE_SHIFT;

	if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES)
		return;

	/* Only handle this for file backed VMAs */
	if (!vma->vm_file)
		return;

	/* Limit this to only shared libraries (*.so) */
	if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so"))
		return;

	/* Only bionic's dynamic linker needs to hint padding pages. */
	if (!linker_ctx())
		return;

	vma_set_pad_pages(vma, nr_pad_pages);
}

static const char *pad_vma_name(struct vm_area_struct *vma)
{
	return "[page size compat]";
}

static const struct vm_operations_struct pad_vma_ops = {
	.name = pad_vma_name,
};

/*
 * Returns a new VMA representing the padding in @vma; if there is no
 * padding in @vma, returns NULL.
 */
struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *pad;

	if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
		return NULL;

	pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);

	memcpy(pad, vma, sizeof(struct vm_area_struct));

	/* Remove file */
	pad->vm_file = NULL;

	/* Add vm_ops->name */
	pad->vm_ops = &pad_vma_ops;

	/* Adjust the start to begin at the start of the padding section */
	pad->vm_start = VMA_PAD_START(pad);

	/*
	 * The modifications to vm_flags below don't need the mmap write lock,
	 * since @pad does not belong to the VMA tree.
	 */
	/* Make the pad vma PROT_NONE */
	__vm_flags_mod(pad, 0, VM_READ|VM_WRITE|VM_EXEC);
	/* Remove padding bits */
	__vm_flags_mod(pad, 0, VM_PAD_MASK);

	return pad;
}

/*
 * Returns a new VMA excluding the padding from @vma; if there is no
 * padding in @vma, returns @vma.
 */
struct vm_area_struct *get_data_vma(struct vm_area_struct *vma)
{
	struct vm_area_struct *data;

	if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
		return vma;

	data = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);

	memcpy(data, vma, sizeof(struct vm_area_struct));

	/* Adjust the end to the start of the padding section */
	data->vm_end = VMA_PAD_START(data);

	return data;
}
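
/*
 * Note: the VMAs returned by get_pad_vma() and get_data_vma() are heap
 * copies that are never inserted into the VMA tree; they exist only so the
 * /proc/<pid>/[s]maps code can display the data and padding portions
 * separately, and are freed in show_map_pad_vma() below. The small
 * GFP_KERNEL allocations above are relied on not to fail in practice.
 */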

/*
 * Calls the show_pad_maps_fn/show_pad_smaps_fn on the @pad VMA, and frees
 * the copies of @vma and @pad.
 */
void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad,
		      struct seq_file *m, void *func, bool smaps)
{
	if (!pad)
		return;

	/*
	 * This cannot happen. @pad is a section of the original VMA.
	 * Therefore @vma cannot be NULL if @pad is not NULL.
	 */
	BUG_ON(!vma);

	/*
	 * This cannot happen. If the @pad VMA was allocated, the
	 * corresponding @vma should have the VM_PAD_MASK bit(s) set.
	 */
	BUG_ON(!(vma->vm_flags & VM_PAD_MASK));

	if (smaps)
		((show_pad_smaps_fn)func)(m, pad);
	else
		((show_pad_maps_fn)func)(m, pad);

	kfree(pad);
	kfree(vma);
}
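
/*
 * A sketch of the intended caller-side pattern (hypothetical names), based
 * on the ownership rules above:
 *
 *	struct vm_area_struct *pad = get_pad_vma(vma);
 *	struct vm_area_struct *data = get_data_vma(vma);
 *
 *	show_map(m, data);                                // data portion
 *	show_map_pad_vma(data, pad, m, show_map, false);  // padding portion
 *
 * show_map_pad_vma() frees both copies, or does nothing if there was no
 * padding, in which case @data is the original VMA and @pad is NULL.
 */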

/*
 * When splitting a padding VMA there are a couple of cases to handle.
 *
 * Given:
 *
 *	| DDDDPPPP |
 *
 * where:
 *	- D represents 1 page of data;
 *	- P represents 1 page of padding;
 *	- | represents the boundaries (start/end) of the VMA
 *
 *
 * 1) Split exactly at the padding boundary
 *
 *	| DDDDPPPP | --> | DDDD | PPPP |
 *
 *	- Remove padding flags from the first VMA.
 *	- The second VMA is all padding.
 *
 * 2) Split within the padding area
 *
 *	| DDDDPPPP | --> | DDDDPP | PP |
 *
 *	- Subtract the length of the second VMA from the first VMA's padding.
 *	- The second VMA is all padding; adjust its padding length (flags).
 *
 * 3) Split within the data area
 *
 *	| DDDDPPPP | --> | DD | DDPPPP |
 *
 *	- Remove padding flags from the first VMA.
 *	- The second VMA has the same padding as before the split.
 */
void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new,
		   unsigned long addr, int new_below)
{
	unsigned long nr_pad_pages = vma_pad_pages(vma);
	unsigned long nr_vma2_pages;
	struct vm_area_struct *first;
	struct vm_area_struct *second;

	if (!nr_pad_pages)
		return;

	if (new_below) {
		first = new;
		second = vma;
	} else {
		first = vma;
		second = new;
	}

	nr_vma2_pages = vma_pages(second);

	if (nr_vma2_pages >= nr_pad_pages) { /* Cases 1 & 3 */
		vm_flags_clear(first, VM_PAD_MASK);
		vma_set_pad_pages(second, nr_pad_pages);
	} else { /* Case 2 */
		vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages);
		vma_set_pad_pages(second, nr_vma2_pages);
	}
}
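
/*
 * Worked example (illustrative): with | DDDDPPPP | (4 data + 4 pad pages),
 * splitting 6 pages in lands within the padding (case 2). The second VMA
 * spans 2 pages, so the first VMA keeps 4 - 2 = 2 pad pages and the second
 * VMA is marked as 2 pages of pure padding.
 */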

#endif /* PAGE_SIZE == SZ_4K */
#endif /* CONFIG_64BIT */