Merge 39bed42de2 ("Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma") into android-mainline

Baby steps in the 5.6-rc1 merge cycle to make things easier to review and debug.

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I0fa183764fd1adbde44e8181f0b3df6cff4da18b

This commit is contained in: commit 7881aee544
@@ -149,14 +149,14 @@ CPU page table into a device page table; HMM helps keep both synchronized. A
 device driver that wants to mirror a process address space must start with the
 registration of a mmu_interval_notifier::

-   mni->ops = &driver_ops;
-   int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
-                                    unsigned long start, unsigned long length,
-                                    struct mm_struct *mm);
+   int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+                                    struct mm_struct *mm, unsigned long start,
+                                    unsigned long length,
+                                    const struct mmu_interval_notifier_ops *ops);

-During the driver_ops->invalidate() callback the device driver must perform
-the update action to the range (mark range read only, or fully unmap,
-etc.). The device must complete the update before the driver callback returns.
+During the ops->invalidate() callback the device driver must perform the
+update action to the range (mark range read only, or fully unmap, etc.). The
+device must complete the update before the driver callback returns.

 When the device driver wants to populate a range of virtual addresses, it can
 use::
@@ -183,7 +183,7 @@ The usage pattern is::
      struct hmm_range range;
      ...

-     range.notifier = &mni;
+     range.notifier = &interval_sub;
      range.start = ...;
      range.end = ...;
      range.pfns = ...;
@@ -191,11 +191,11 @@ The usage pattern is::
      range.values = ...;
      range.pfn_shift = ...;

-     if (!mmget_not_zero(mni->notifier.mm))
+     if (!mmget_not_zero(interval_sub->notifier.mm))
          return -EFAULT;

 again:
-     range.notifier_seq = mmu_interval_read_begin(&mni);
+     range.notifier_seq = mmu_interval_read_begin(&interval_sub);
      down_read(&mm->mmap_sem);
      ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
      if (ret) {
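The hunks above track the rename of the example subscription from "mni" to "interval_sub" and the new argument order of mmu_interval_notifier_insert(). A condensed, hedged sketch of the full pattern the documentation describes is below; struct driver_mirror, driver_update_ptes() and mirror->lock are illustrative placeholders rather than kernel API, and the mmap_sem / HMM_RANGE_SNAPSHOT usage follows the 5.6-era interface quoted above:

    /* Illustrative driver-side sketch; driver_mirror/driver_update_ptes are placeholders. */
    static bool driver_invalidate(struct mmu_interval_notifier *interval_sub,
                                  const struct mmu_notifier_range *range,
                                  unsigned long cur_seq)
    {
            struct driver_mirror *mirror =
                    container_of(interval_sub, struct driver_mirror, notifier);

            if (!mmu_notifier_range_blockable(range))
                    return false;

            mutex_lock(&mirror->lock);
            mmu_interval_set_seq(interval_sub, cur_seq);
            /* Invalidate the device page table for [range->start, range->end). */
            driver_update_ptes(mirror, range->start, range->end);
            mutex_unlock(&mirror->lock);
            return true;
    }

    static const struct mmu_interval_notifier_ops driver_ops = {
            .invalidate = driver_invalidate,
    };

    /* Registration, then the read_begin()/read_retry() fault loop from the text: */
            ret = mmu_interval_notifier_insert(&mirror->notifier, mm,
                                               start, length, &driver_ops);
            ...
    again:
            range.notifier_seq = mmu_interval_read_begin(&mirror->notifier);
            down_read(&mm->mmap_sem);
            ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
            up_read(&mm->mmap_sem);
            ...
            mutex_lock(&mirror->lock);
            if (mmu_interval_read_retry(&mirror->notifier, range.notifier_seq)) {
                    mutex_unlock(&mirror->lock);
                    goto again;
            }
            /* Install range.pfns into the device page table while holding the lock. */
            mutex_unlock(&mirror->lock);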
@@ -476,3 +476,4 @@
 544	common	pidfd_open			sys_pidfd_open
 # 545 reserved for clone3
 547	common	openat2				sys_openat2
+548	common	pidfd_getfd			sys_pidfd_getfd

@@ -450,3 +450,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls	(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)

-#define __NR_compat_syscalls		438
+#define __NR_compat_syscalls		439
 #endif

 #define __ARCH_WANT_SYS_CLONE

@@ -881,6 +881,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 __SYSCALL(__NR_clone3, sys_clone3)
 #define __NR_openat2 437
 __SYSCALL(__NR_openat2, sys_openat2)
+#define __NR_pidfd_getfd 438
+__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)

 /*
  * Please add new compat syscalls above this comment and update

@@ -357,3 +357,4 @@
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -436,3 +436,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				__sys_clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -442,3 +442,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -375,3 +375,4 @@
 434	n32	pidfd_open			sys_pidfd_open
 435	n32	clone3				__sys_clone3
 437	n32	openat2				sys_openat2
+438	n32	pidfd_getfd			sys_pidfd_getfd

@@ -351,3 +351,4 @@
 434	n64	pidfd_open			sys_pidfd_open
 435	n64	clone3				__sys_clone3
 437	n64	openat2				sys_openat2
+438	n64	pidfd_getfd			sys_pidfd_getfd

@@ -424,3 +424,4 @@
 434	o32	pidfd_open			sys_pidfd_open
 435	o32	clone3				__sys_clone3
 437	o32	openat2				sys_openat2
+438	o32	pidfd_getfd			sys_pidfd_getfd

@@ -434,3 +434,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3_wrapper
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -518,3 +518,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	nospu	clone3				ppc_clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -439,3 +439,4 @@
 434	common	pidfd_open		sys_pidfd_open		sys_pidfd_open
 435	common	clone3			sys_clone3		sys_clone3
 437	common	openat2			sys_openat2		sys_openat2
+438	common	pidfd_getfd		sys_pidfd_getfd		sys_pidfd_getfd

@@ -439,3 +439,4 @@
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -482,3 +482,4 @@
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd

@@ -441,3 +441,4 @@
 434	i386	pidfd_open		sys_pidfd_open		__ia32_sys_pidfd_open
 435	i386	clone3			sys_clone3		__ia32_sys_clone3
 437	i386	openat2			sys_openat2		__ia32_sys_openat2
+438	i386	pidfd_getfd		sys_pidfd_getfd		__ia32_sys_pidfd_getfd

@@ -358,6 +358,7 @@
 434	common	pidfd_open		__x64_sys_pidfd_open
 435	common	clone3			__x64_sys_clone3/ptregs
 437	common	openat2			__x64_sys_openat2
+438	common	pidfd_getfd		__x64_sys_pidfd_getfd

 #
 # x32-specific system call numbers start at 512 to avoid cache impact

@@ -407,3 +407,4 @@
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
+438	common	pidfd_getfd			sys_pidfd_getfd
@@ -2402,10 +2402,12 @@ static void binder_deferred_fd_close(int fd)
 		return;
 	init_task_work(&twcb->twork, binder_do_fd_close);
 	__close_fd_get_file(fd, &twcb->file);
-	if (twcb->file)
+	if (twcb->file) {
+		filp_close(twcb->file, current->files);
 		task_work_add(current, &twcb->twork, true);
-	else
+	} else {
 		kfree(twcb);
+	}
 }

 static void binder_transaction_buffer_release(struct binder_proc *proc,
@@ -355,12 +355,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 	return container_of(p, struct ep_pqueue, pt)->epi;
 }

-/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_has_event(int op)
-{
-	return op != EPOLL_CTL_DEL;
-}
-
 /* Initialize the poll safe wake up structure */
 static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
@@ -2076,27 +2070,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 	return do_epoll_create(0);
 }

-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.
- */
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
-		struct epoll_event __user *, event)
+static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
+				   bool nonblock)
+{
+	if (!nonblock) {
+		mutex_lock_nested(mutex, depth);
+		return 0;
+	}
+	if (mutex_trylock(mutex))
+		return 0;
+	return -EAGAIN;
+}
+
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+		 bool nonblock)
 {
 	int error;
 	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct epoll_event epds;
 	struct eventpoll *tep = NULL;

-	error = -EFAULT;
-	if (ep_op_has_event(op) &&
-	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
-		goto error_return;
-
 	error = -EBADF;
 	f = fdget(epfd);
 	if (!f.file)
@@ -2114,7 +2109,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,

 	/* Check if EPOLLWAKEUP is allowed */
 	if (ep_op_has_event(op))
-		ep_take_care_of_epollwakeup(&epds);
+		ep_take_care_of_epollwakeup(epds);

 	/*
 	 * We have to check that the file structure underneath the file descriptor
@@ -2130,11 +2125,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 	 * Also, we do not currently supported nested exclusive wakeups.
 	 */
-	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
+	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
 			goto error_tgt_fput;
 		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
-				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 			goto error_tgt_fput;
 	}

@@ -2159,13 +2154,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * deep wakeup paths from forming in parallel through multiple
 	 * EPOLL_CTL_ADD operations.
 	 */
-	mutex_lock_nested(&ep->mtx, 0);
+	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+	if (error)
+		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
 		if (!list_empty(&f.file->f_ep_links) ||
 						is_file_epoll(tf.file)) {
-			full_check = 1;
 			mutex_unlock(&ep->mtx);
-			mutex_lock(&epmutex);
+			error = epoll_mutex_lock(&epmutex, 0, nonblock);
+			if (error)
+				goto error_tgt_fput;
+			full_check = 1;
 			if (is_file_epoll(tf.file)) {
 				error = -ELOOP;
 				if (ep_loop_check(ep, tf.file) != 0) {
@@ -2175,10 +2174,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			} else
 				list_add(&tf.file->f_tfile_llink,
 							&tfile_check_list);
-			mutex_lock_nested(&ep->mtx, 0);
+			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+			if (error) {
+out_del:
+				list_del(&tf.file->f_tfile_llink);
+				goto error_tgt_fput;
+			}
 			if (is_file_epoll(tf.file)) {
 				tep = tf.file->private_data;
-				mutex_lock_nested(&tep->mtx, 1);
+				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
+				if (error) {
+					mutex_unlock(&ep->mtx);
+					goto out_del;
+				}
 			}
 		}
 	}
@@ -2194,8 +2202,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd, full_check);
+			epds->events |= EPOLLERR | EPOLLHUP;
+			error = ep_insert(ep, epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
 		if (full_check)
@@ -2210,8 +2218,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_MOD:
 		if (epi) {
 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-				epds.events |= EPOLLERR | EPOLLHUP;
-				error = ep_modify(ep, epi, &epds);
+				epds->events |= EPOLLERR | EPOLLHUP;
+				error = ep_modify(ep, epi, epds);
 			}
 		} else
 			error = -ENOENT;
@@ -2233,6 +2241,23 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	return error;
 }

+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
+{
+	struct epoll_event epds;
+
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		return -EFAULT;
+
+	return do_epoll_ctl(epfd, op, fd, &epds, false);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
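The change above splits the body of epoll_ctl(2) into do_epoll_ctl(), whose nonblock argument turns the internal mutex acquisitions into trylocks that fail with -EAGAIN instead of sleeping; the syscall keeps its old behaviour by passing nonblock=false, and the nonblocking path appears intended for io_uring's new IORING_OP_EPOLL_CTL opcode added elsewhere in this merge. For orientation, a minimal userspace sketch of the unchanged epoll_ctl(2) semantics:

    #include <sys/epoll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            struct epoll_event ev = { .events = EPOLLIN };
            int pfd[2];
            int epfd = epoll_create1(0);

            if (epfd < 0 || pipe(pfd) < 0)
                    return 1;
            ev.data.fd = pfd[0];
            /* Same EPOLL_CTL_ADD path as before; it now runs through do_epoll_ctl(). */
            if (epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev) < 0) {
                    perror("EPOLL_CTL_ADD");
                    return 1;
            }
            puts("pipe read end registered");
            return 0;
    }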
28
fs/file.c
28
fs/file.c
@ -642,7 +642,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
|
||||
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
|
||||
|
||||
/*
|
||||
* variant of __close_fd that gets a ref on the file for later fput
|
||||
* variant of __close_fd that gets a ref on the file for later fput.
|
||||
* The caller must ensure that filp_close() called on the file, and then
|
||||
* an fput().
|
||||
*/
|
||||
int __close_fd_get_file(unsigned int fd, struct file **res)
|
||||
{
|
||||
@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
|
||||
spin_unlock(&files->file_lock);
|
||||
get_file(file);
|
||||
*res = file;
|
||||
return filp_close(file, files);
|
||||
return 0;
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&files->file_lock);
|
||||
@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files)
|
||||
spin_unlock(&files->file_lock);
|
||||
}
|
||||
|
||||
static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
|
||||
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
|
||||
fmode_t mask, unsigned int refs)
|
||||
{
|
||||
struct files_struct *files = current->files;
|
||||
struct file *file;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -729,6 +731,12 @@ static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
|
||||
return file;
|
||||
}
|
||||
|
||||
static inline struct file *__fget(unsigned int fd, fmode_t mask,
|
||||
unsigned int refs)
|
||||
{
|
||||
return __fget_files(current->files, fd, mask, refs);
|
||||
}
|
||||
|
||||
struct file *fget_many(unsigned int fd, unsigned int refs)
|
||||
{
|
||||
return __fget(fd, FMODE_PATH, refs);
|
||||
@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd)
|
||||
}
|
||||
EXPORT_SYMBOL(fget_raw);
|
||||
|
||||
struct file *fget_task(struct task_struct *task, unsigned int fd)
|
||||
{
|
||||
struct file *file = NULL;
|
||||
|
||||
task_lock(task);
|
||||
if (task->files)
|
||||
file = __fget_files(task->files, fd, 0, 1);
|
||||
task_unlock(task);
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
|
||||
*
|
||||
|
@@ -122,6 +122,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
+extern struct open_how build_open_how(int flags, umode_t mode);
+extern int build_open_flags(const struct open_how *how, struct open_flags *op);

 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 long do_faccessat(int dfd, const char __user *filename, int mode);
@@ -180,3 +182,9 @@ extern const struct dentry_operations ns_dentry_operations;

 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
+
+/*
+ * fs/stat.c:
+ */
+unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
+int cp_statx(const struct kstat *stat, struct statx __user *buffer);
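build_open_how() and build_open_flags() are exposed here (made non-static in the fs/open.c hunks later in this diff) so other kernel code in this merge can reuse the openat2 flag handling. For orientation, a hedged userspace sketch of the openat2(2) call wired up as syscall 437 in the tables above; the RESOLVE_BENEATH choice and the relative path are illustrative only:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/openat2.h>      /* struct open_how, RESOLVE_* (new in 5.6) */
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef __NR_openat2
    #define __NR_openat2 437        /* per the syscall tables in this merge */
    #endif

    int main(void)
    {
            struct open_how how = {
                    .flags   = O_RDONLY | O_CLOEXEC,
                    .resolve = RESOLVE_BENEATH,     /* refuse ".."/absolute-symlink escapes */
            };
            int fd = syscall(__NR_openat2, AT_FDCWD, "data/config.txt", &how, sizeof(how));

            if (fd < 0)
                    perror("openat2");
            else
                    close(fd);
            return 0;
    }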
fs/io-wq.c (103 lines changed)
@ -56,7 +56,8 @@ struct io_worker {
|
||||
|
||||
struct rcu_head rcu;
|
||||
struct mm_struct *mm;
|
||||
const struct cred *creds;
|
||||
const struct cred *cur_creds;
|
||||
const struct cred *saved_creds;
|
||||
struct files_struct *restore_files;
|
||||
};
|
||||
|
||||
@ -109,10 +110,10 @@ struct io_wq {
|
||||
|
||||
struct task_struct *manager;
|
||||
struct user_struct *user;
|
||||
const struct cred *creds;
|
||||
struct mm_struct *mm;
|
||||
refcount_t refs;
|
||||
struct completion done;
|
||||
|
||||
refcount_t use_refs;
|
||||
};
|
||||
|
||||
static bool io_worker_get(struct io_worker *worker)
|
||||
@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
|
||||
{
|
||||
bool dropped_lock = false;
|
||||
|
||||
if (worker->creds) {
|
||||
revert_creds(worker->creds);
|
||||
worker->creds = NULL;
|
||||
if (worker->saved_creds) {
|
||||
revert_creds(worker->saved_creds);
|
||||
worker->cur_creds = worker->saved_creds = NULL;
|
||||
}
|
||||
|
||||
if (current->files != worker->restore_files) {
|
||||
@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
|
||||
{
|
||||
if (worker->mm) {
|
||||
unuse_mm(worker->mm);
|
||||
mmput(worker->mm);
|
||||
worker->mm = NULL;
|
||||
}
|
||||
if (!work->mm) {
|
||||
set_fs(KERNEL_DS);
|
||||
return;
|
||||
}
|
||||
if (mmget_not_zero(work->mm)) {
|
||||
use_mm(work->mm);
|
||||
if (!worker->mm)
|
||||
set_fs(USER_DS);
|
||||
worker->mm = work->mm;
|
||||
/* hang on to this mm */
|
||||
work->mm = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
/* failed grabbing mm, ensure work gets cancelled */
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
}
|
||||
|
||||
static void io_wq_switch_creds(struct io_worker *worker,
|
||||
struct io_wq_work *work)
|
||||
{
|
||||
const struct cred *old_creds = override_creds(work->creds);
|
||||
|
||||
worker->cur_creds = work->creds;
|
||||
if (worker->saved_creds)
|
||||
put_cred(old_creds); /* creds set by previous switch */
|
||||
else
|
||||
worker->saved_creds = old_creds;
|
||||
}
|
||||
|
||||
static void io_worker_handle_work(struct io_worker *worker)
|
||||
__releases(wqe->lock)
|
||||
{
|
||||
@ -438,24 +476,19 @@ static void io_worker_handle_work(struct io_worker *worker)
|
||||
if (work->flags & IO_WQ_WORK_CB)
|
||||
work->func(&work);
|
||||
|
||||
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
|
||||
current->files != work->files) {
|
||||
if (work->files && current->files != work->files) {
|
||||
task_lock(current);
|
||||
current->files = work->files;
|
||||
task_unlock(current);
|
||||
}
|
||||
if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
|
||||
wq->mm) {
|
||||
if (mmget_not_zero(wq->mm)) {
|
||||
use_mm(wq->mm);
|
||||
set_fs(USER_DS);
|
||||
worker->mm = wq->mm;
|
||||
} else {
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
}
|
||||
}
|
||||
if (!worker->creds)
|
||||
worker->creds = override_creds(wq->creds);
|
||||
if (work->mm != worker->mm)
|
||||
io_wq_switch_mm(worker, work);
|
||||
if (worker->cur_creds != work->creds)
|
||||
io_wq_switch_creds(worker, work);
|
||||
/*
|
||||
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
|
||||
* the worker function will do the right thing.
|
||||
*/
|
||||
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
|
||||
work->flags |= IO_WQ_WORK_CANCEL;
|
||||
if (worker->mm)
|
||||
@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
|
||||
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
|
||||
{
|
||||
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
|
||||
int work_flags;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
|
||||
return;
|
||||
}
|
||||
|
||||
work_flags = work->flags;
|
||||
spin_lock_irqsave(&wqe->lock, flags);
|
||||
wq_list_add_tail(&work->list, &wqe->work_list);
|
||||
wqe->flags &= ~IO_WQE_FLAG_STALLED;
|
||||
spin_unlock_irqrestore(&wqe->lock, flags);
|
||||
|
||||
if (!atomic_read(&acct->nr_running))
|
||||
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
|
||||
!atomic_read(&acct->nr_running))
|
||||
io_wqe_wake_worker(wqe, acct);
|
||||
}
|
||||
|
||||
@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
|
||||
*/
|
||||
spin_lock_irqsave(&worker->lock, flags);
|
||||
if (worker->cur_work &&
|
||||
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
|
||||
data->cancel(worker->cur_work, data->caller_data)) {
|
||||
send_sig(SIGINT, worker->task, 1);
|
||||
ret = true;
|
||||
@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
|
||||
return false;
|
||||
|
||||
spin_lock_irqsave(&worker->lock, flags);
|
||||
if (worker->cur_work == work) {
|
||||
if (worker->cur_work == work &&
|
||||
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
|
||||
send_sig(SIGINT, worker->task, 1);
|
||||
ret = true;
|
||||
}
|
||||
@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
|
||||
/* caller must already hold a reference to this */
|
||||
wq->user = data->user;
|
||||
wq->creds = data->creds;
|
||||
|
||||
for_each_node(node) {
|
||||
struct io_wqe *wqe;
|
||||
@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
|
||||
init_completion(&wq->done);
|
||||
|
||||
/* caller must have already done mmgrab() on this mm */
|
||||
wq->mm = data->mm;
|
||||
|
||||
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
|
||||
if (!IS_ERR(wq->manager)) {
|
||||
wake_up_process(wq->manager);
|
||||
@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
refcount_set(&wq->use_refs, 1);
|
||||
reinit_completion(&wq->done);
|
||||
return wq;
|
||||
}
|
||||
@ -1078,13 +1113,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
|
||||
{
|
||||
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
|
||||
return false;
|
||||
|
||||
return refcount_inc_not_zero(&wq->use_refs);
|
||||
}
|
||||
|
||||
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
|
||||
{
|
||||
wake_up_process(worker->task);
|
||||
return false;
|
||||
}
|
||||
|
||||
void io_wq_destroy(struct io_wq *wq)
|
||||
static void __io_wq_destroy(struct io_wq *wq)
|
||||
{
|
||||
int node;
|
||||
|
||||
@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
|
||||
kfree(wq->wqes);
|
||||
kfree(wq);
|
||||
}
|
||||
|
||||
void io_wq_destroy(struct io_wq *wq)
|
||||
{
|
||||
if (refcount_dec_and_test(&wq->use_refs))
|
||||
__io_wq_destroy(wq);
|
||||
}
|
||||
|
fs/io-wq.h (11 lines changed)
@ -7,11 +7,11 @@ enum {
|
||||
IO_WQ_WORK_CANCEL = 1,
|
||||
IO_WQ_WORK_HAS_MM = 2,
|
||||
IO_WQ_WORK_HASHED = 4,
|
||||
IO_WQ_WORK_NEEDS_USER = 8,
|
||||
IO_WQ_WORK_NEEDS_FILES = 16,
|
||||
IO_WQ_WORK_UNBOUND = 32,
|
||||
IO_WQ_WORK_INTERNAL = 64,
|
||||
IO_WQ_WORK_CB = 128,
|
||||
IO_WQ_WORK_NO_CANCEL = 256,
|
||||
IO_WQ_WORK_CONCURRENT = 512,
|
||||
|
||||
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
|
||||
};
|
||||
@ -72,6 +72,8 @@ struct io_wq_work {
|
||||
};
|
||||
void (*func)(struct io_wq_work **);
|
||||
struct files_struct *files;
|
||||
struct mm_struct *mm;
|
||||
const struct cred *creds;
|
||||
unsigned flags;
|
||||
};
|
||||
|
||||
@ -81,21 +83,22 @@ struct io_wq_work {
|
||||
(work)->func = _func; \
|
||||
(work)->flags = 0; \
|
||||
(work)->files = NULL; \
|
||||
(work)->mm = NULL; \
|
||||
(work)->creds = NULL; \
|
||||
} while (0) \
|
||||
|
||||
typedef void (get_work_fn)(struct io_wq_work *);
|
||||
typedef void (put_work_fn)(struct io_wq_work *);
|
||||
|
||||
struct io_wq_data {
|
||||
struct mm_struct *mm;
|
||||
struct user_struct *user;
|
||||
const struct cred *creds;
|
||||
|
||||
get_work_fn *get_work;
|
||||
put_work_fn *put_work;
|
||||
};
|
||||
|
||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
|
||||
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
|
||||
void io_wq_destroy(struct io_wq *wq);
|
||||
|
||||
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
|
||||
|
fs/io_uring.c (2218 lines changed): file diff suppressed because it is too large.
@@ -970,7 +970,7 @@ EXPORT_SYMBOL(open_with_fake_path);
 #define WILL_CREATE(flags)	(flags & (O_CREAT | __O_TMPFILE))
 #define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

-static inline struct open_how build_open_how(int flags, umode_t mode)
+inline struct open_how build_open_how(int flags, umode_t mode)
 {
 	struct open_how how = {
 		.flags = flags & VALID_OPEN_FLAGS,
@@ -986,8 +986,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
 	return how;
 }

-static inline int build_open_flags(const struct open_how *how,
-				   struct open_flags *op)
+inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 {
 	int flags = how->flags;
 	int lookup_flags = 0;
fs/stat.c (34 lines changed)

@@ -21,6 +21,8 @@
 #include <linux/uaccess.h>
 #include <asm/unistd.h>

+#include "internal.h"
+
 /**
  * generic_fillattr - Fill in the basic attributes from the inode struct
  * @inode: Inode to use as the source
@@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
 }
 EXPORT_SYMBOL(vfs_statx_fd);

+inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
+{
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+		return -EINVAL;
+
+	*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		*lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_NO_AUTOMOUNT)
+		*lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_EMPTY_PATH)
+		*lookup_flags |= LOOKUP_EMPTY;
+
+	return 0;
+}
+
 /**
  * vfs_statx - Get basic and extra attributes by filename
  * @dfd: A file descriptor representing the base dir for a relative filename
@@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
 {
 	struct path path;
 	int error = -EINVAL;
-	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	unsigned lookup_flags;

-	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
-		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+	if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
 		return -EINVAL;
-
-	if (flags & AT_SYMLINK_NOFOLLOW)
-		lookup_flags &= ~LOOKUP_FOLLOW;
-	if (flags & AT_NO_AUTOMOUNT)
-		lookup_flags &= ~LOOKUP_AUTOMOUNT;
-	if (flags & AT_EMPTY_PATH)
-		lookup_flags |= LOOKUP_EMPTY;

 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
@@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */

-static noinline_for_stack int
+noinline_for_stack int
 cp_statx(const struct kstat *stat, struct statx __user *buffer)
 {
 	struct statx tmp;
@@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
 	eventpoll_release_file(file);
 }

+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+		 bool nonblock);
+
+/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
+static inline int ep_op_has_event(int op)
+{
+	return op != EPOLL_CTL_DEL;
+}
+
 #else

 static inline void eventpoll_init_file(struct file *file) {}
@@ -16,6 +16,7 @@ extern void fput(struct file *);
 extern void fput_many(struct file *, unsigned int);

 struct file_operations;
+struct task_struct;
 struct vfsmount;
 struct dentry;
 struct inode;
@@ -47,6 +48,7 @@ static inline void fdput(struct fd fd)
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_many(unsigned int fd, unsigned int refs);
 extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_task(struct task_struct *task, unsigned int fd);
 extern unsigned long __fdget(unsigned int fd);
 extern unsigned long __fdget_raw(unsigned int fd);
 extern unsigned long __fdget_pos(unsigned int fd);
@@ -2324,6 +2324,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
 		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);

 static inline unsigned long
 do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -497,7 +497,7 @@ struct mm_struct {
 		/* store ref to file /proc/<pid>/exe symlink points to */
 		struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
-		struct mmu_notifier_mm *mmu_notifier_mm;
+		struct mmu_notifier_subscriptions *notifier_subscriptions;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
@ -8,7 +8,7 @@
|
||||
#include <linux/srcu.h>
|
||||
#include <linux/interval_tree.h>
|
||||
|
||||
struct mmu_notifier_mm;
|
||||
struct mmu_notifier_subscriptions;
|
||||
struct mmu_notifier;
|
||||
struct mmu_notifier_range;
|
||||
struct mmu_interval_notifier;
|
||||
@ -73,7 +73,7 @@ struct mmu_notifier_ops {
|
||||
* through the gart alias address, so leading to memory
|
||||
* corruption.
|
||||
*/
|
||||
void (*release)(struct mmu_notifier *mn,
|
||||
void (*release)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm);
|
||||
|
||||
/*
|
||||
@ -85,7 +85,7 @@ struct mmu_notifier_ops {
|
||||
* Start-end is necessary in case the secondary MMU is mapping the page
|
||||
* at a smaller granularity than the primary MMU.
|
||||
*/
|
||||
int (*clear_flush_young)(struct mmu_notifier *mn,
|
||||
int (*clear_flush_young)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end);
|
||||
@ -95,7 +95,7 @@ struct mmu_notifier_ops {
|
||||
* latter, it is supposed to test-and-clear the young/accessed bitflag
|
||||
* in the secondary pte, but it may omit flushing the secondary tlb.
|
||||
*/
|
||||
int (*clear_young)(struct mmu_notifier *mn,
|
||||
int (*clear_young)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end);
|
||||
@ -106,7 +106,7 @@ struct mmu_notifier_ops {
|
||||
* frequently used without actually clearing the flag or tearing
|
||||
* down the secondary mapping on the page.
|
||||
*/
|
||||
int (*test_young)(struct mmu_notifier *mn,
|
||||
int (*test_young)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm,
|
||||
unsigned long address);
|
||||
|
||||
@ -114,7 +114,7 @@ struct mmu_notifier_ops {
|
||||
* change_pte is called in cases that pte mapping to page is changed:
|
||||
* for example, when ksm remaps pte to point to a new shared page.
|
||||
*/
|
||||
void (*change_pte)(struct mmu_notifier *mn,
|
||||
void (*change_pte)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm,
|
||||
unsigned long address,
|
||||
pte_t pte);
|
||||
@ -169,9 +169,9 @@ struct mmu_notifier_ops {
|
||||
* invalidate_range_end.
|
||||
*
|
||||
*/
|
||||
int (*invalidate_range_start)(struct mmu_notifier *mn,
|
||||
int (*invalidate_range_start)(struct mmu_notifier *subscription,
|
||||
const struct mmu_notifier_range *range);
|
||||
void (*invalidate_range_end)(struct mmu_notifier *mn,
|
||||
void (*invalidate_range_end)(struct mmu_notifier *subscription,
|
||||
const struct mmu_notifier_range *range);
|
||||
|
||||
/*
|
||||
@ -192,8 +192,10 @@ struct mmu_notifier_ops {
|
||||
* of what was passed to invalidate_range_start()/end(), if
|
||||
* called between those functions.
|
||||
*/
|
||||
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long end);
|
||||
void (*invalidate_range)(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end);
|
||||
|
||||
/*
|
||||
* These callbacks are used with the get/put interface to manage the
|
||||
@ -206,7 +208,7 @@ struct mmu_notifier_ops {
|
||||
* and cannot sleep.
|
||||
*/
|
||||
struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
|
||||
void (*free_notifier)(struct mmu_notifier *mn);
|
||||
void (*free_notifier)(struct mmu_notifier *subscription);
|
||||
};
|
||||
|
||||
/*
|
||||
@ -235,7 +237,7 @@ struct mmu_notifier {
|
||||
* was required but mmu_notifier_range_blockable(range) is false.
|
||||
*/
|
||||
struct mmu_interval_notifier_ops {
|
||||
bool (*invalidate)(struct mmu_interval_notifier *mni,
|
||||
bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq);
|
||||
};
|
||||
@ -265,7 +267,7 @@ struct mmu_notifier_range {
|
||||
|
||||
static inline int mm_has_notifiers(struct mm_struct *mm)
|
||||
{
|
||||
return unlikely(mm->mmu_notifier_mm);
|
||||
return unlikely(mm->notifier_subscriptions);
|
||||
}
|
||||
|
||||
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
|
||||
@ -280,30 +282,31 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
|
||||
up_write(&mm->mmap_sem);
|
||||
return ret;
|
||||
}
|
||||
void mmu_notifier_put(struct mmu_notifier *mn);
|
||||
void mmu_notifier_put(struct mmu_notifier *subscription);
|
||||
void mmu_notifier_synchronize(void);
|
||||
|
||||
extern int mmu_notifier_register(struct mmu_notifier *mn,
|
||||
extern int mmu_notifier_register(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm);
|
||||
extern int __mmu_notifier_register(struct mmu_notifier *mn,
|
||||
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm);
|
||||
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
|
||||
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm);
|
||||
|
||||
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
|
||||
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
|
||||
unsigned long
|
||||
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
|
||||
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
|
||||
struct mm_struct *mm, unsigned long start,
|
||||
unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops);
|
||||
int mmu_interval_notifier_insert_locked(
|
||||
struct mmu_interval_notifier *mni, struct mm_struct *mm,
|
||||
struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops);
|
||||
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
|
||||
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);
|
||||
|
||||
/**
|
||||
* mmu_interval_set_seq - Save the invalidation sequence
|
||||
* @mni - The mni passed to invalidate
|
||||
* @interval_sub - The subscription passed to invalidate
|
||||
* @cur_seq - The cur_seq passed to the invalidate() callback
|
||||
*
|
||||
* This must be called unconditionally from the invalidate callback of a
|
||||
@ -314,15 +317,16 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
|
||||
* If the caller does not call mmu_interval_read_begin() or
|
||||
* mmu_interval_read_retry() then this call is not required.
|
||||
*/
|
||||
static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
|
||||
unsigned long cur_seq)
|
||||
static inline void
|
||||
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
WRITE_ONCE(mni->invalidate_seq, cur_seq);
|
||||
WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
|
||||
}
|
||||
|
||||
/**
|
||||
* mmu_interval_read_retry - End a read side critical section against a VA range
|
||||
* mni: The range
|
||||
* interval_sub: The subscription
|
||||
* seq: The return of the paired mmu_interval_read_begin()
|
||||
*
|
||||
* This MUST be called under a user provided lock that is also held
|
||||
@ -334,15 +338,16 @@ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
|
||||
* Returns true if an invalidation collided with this critical section, and
|
||||
* the caller should retry.
|
||||
*/
|
||||
static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
|
||||
unsigned long seq)
|
||||
static inline bool
|
||||
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
|
||||
unsigned long seq)
|
||||
{
|
||||
return mni->invalidate_seq != seq;
|
||||
return interval_sub->invalidate_seq != seq;
|
||||
}
|
||||
|
||||
/**
|
||||
* mmu_interval_check_retry - Test if a collision has occurred
|
||||
* mni: The range
|
||||
* interval_sub: The subscription
|
||||
* seq: The return of the matching mmu_interval_read_begin()
|
||||
*
|
||||
* This can be used in the critical section between mmu_interval_read_begin()
|
||||
@ -357,14 +362,15 @@ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
|
||||
* This call can be used as part of loops and other expensive operations to
|
||||
* expedite a retry.
|
||||
*/
|
||||
static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
|
||||
unsigned long seq)
|
||||
static inline bool
|
||||
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
|
||||
unsigned long seq)
|
||||
{
|
||||
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
|
||||
return READ_ONCE(mni->invalidate_seq) != seq;
|
||||
return READ_ONCE(interval_sub->invalidate_seq) != seq;
|
||||
}
|
||||
|
||||
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
|
||||
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
|
||||
extern void __mmu_notifier_release(struct mm_struct *mm);
|
||||
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
@ -480,15 +486,15 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
|
||||
__mmu_notifier_invalidate_range(mm, start, end);
|
||||
}
|
||||
|
||||
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
|
||||
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
|
||||
{
|
||||
mm->mmu_notifier_mm = NULL;
|
||||
mm->notifier_subscriptions = NULL;
|
||||
}
|
||||
|
||||
static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
|
||||
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
|
||||
{
|
||||
if (mm_has_notifiers(mm))
|
||||
__mmu_notifier_mm_destroy(mm);
|
||||
__mmu_notifier_subscriptions_destroy(mm);
|
||||
}
|
||||
|
||||
|
||||
@ -692,11 +698,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
|
||||
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
|
||||
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
|
||||
|
@@ -209,6 +209,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	percpu_ref_get_many(ref, 1);
 }

+/**
+ * percpu_ref_tryget_many - try to increment a percpu refcount
+ * @ref: percpu_ref to try-get
+ * @nr: number of references to get
+ *
+ * Increment a percpu refcount by @nr unless its count already reached zero.
+ * Returns %true on success; %false on failure.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
+					  unsigned long nr)
+{
+	unsigned long __percpu *percpu_count;
+	bool ret;
+
+	rcu_read_lock();
+
+	if (__ref_is_percpu(ref, &percpu_count)) {
+		this_cpu_add(*percpu_count, nr);
+		ret = true;
+	} else {
+		ret = atomic_long_add_unless(&ref->count, nr, 0);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /**
  * percpu_ref_tryget - try to increment a percpu refcount
  * @ref: percpu_ref to try-get
@@ -220,21 +250,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
  */
 static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 {
-	unsigned long __percpu *percpu_count;
-	bool ret;
-
-	rcu_read_lock();
-
-	if (__ref_is_percpu(ref, &percpu_count)) {
-		this_cpu_inc(*percpu_count);
-		ret = true;
-	} else {
-		ret = atomic_long_inc_not_zero(&ref->count);
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	return percpu_ref_tryget_many(ref, 1);
 }

 /**
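percpu_ref_tryget() becomes a thin wrapper over the new percpu_ref_tryget_many(). One plausible caller pattern, sketched here with illustrative names that are not taken from this merge, is taking one reference per request before submitting a whole batch:

    /* Illustrative only: reserve nr_reqs references up front, drop one per completion. */
    static bool reserve_refs_for_batch(struct percpu_ref *ref, unsigned int nr_reqs)
    {
            if (!percpu_ref_tryget_many(ref, nr_reqs))
                    return false;   /* ref already hit zero; reject the batch */

            /* ... submit nr_reqs requests; each completion calls percpu_ref_put(ref) ... */
            return true;
    }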
@@ -921,7 +921,7 @@ struct task_struct {

 	/* Signal handlers: */
 	struct signal_struct		*signal;
-	struct sighand_struct		*sighand;
+	struct sighand_struct __rcu	*sighand;
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
@@ -1002,6 +1002,7 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
+asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);

 /*
  * Architecture-specific system calls
@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
|
||||
* io_uring_submit_sqe - called before submitting one SQE
|
||||
*
|
||||
* @ctx: pointer to a ring context structure
|
||||
* @opcode: opcode of request
|
||||
* @user_data: user data associated with the request
|
||||
* @force_nonblock: whether a context blocking or not
|
||||
* @sq_thread: true if sq_thread has submitted this SQE
|
||||
@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
|
||||
*/
|
||||
TRACE_EVENT(io_uring_submit_sqe,
|
||||
|
||||
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
|
||||
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
|
||||
bool sq_thread),
|
||||
|
||||
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
|
||||
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
|
||||
|
||||
TP_STRUCT__entry (
|
||||
__field( void *, ctx )
|
||||
__field( u8, opcode )
|
||||
__field( u64, user_data )
|
||||
__field( bool, force_nonblock )
|
||||
__field( bool, sq_thread )
|
||||
@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->opcode = opcode;
|
||||
__entry->user_data = user_data;
|
||||
__entry->force_nonblock = force_nonblock;
|
||||
__entry->sq_thread = sq_thread;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
|
||||
__entry->ctx, (unsigned long long) __entry->user_data,
|
||||
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
|
||||
__entry->ctx, __entry->opcode,
|
||||
(unsigned long long) __entry->user_data,
|
||||
__entry->force_nonblock, __entry->sq_thread)
|
||||
);
|
||||
|
||||
|
@@ -853,9 +853,11 @@ __SYSCALL(__NR_clone3, sys_clone3)

 #define __NR_openat2 437
 __SYSCALL(__NR_openat2, sys_openat2)
+#define __NR_pidfd_getfd 438
+__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)

 #undef __NR_syscalls
-#define __NR_syscalls 438
+#define __NR_syscalls 439

 /*
  * 32 bit systems traditionally used different
@@ -301,6 +301,7 @@ struct vfs_ns_cap_data {
 /* Allow more than 64hz interrupts from the real-time clock */
 /* Override max number of consoles on console allocation */
 /* Override max number of keymaps */
+/* Control memory reclaim behavior */

 #define CAP_SYS_RESOURCE     24
@ -34,21 +34,43 @@ struct io_uring_sqe {
|
||||
__u32 timeout_flags;
|
||||
__u32 accept_flags;
|
||||
__u32 cancel_flags;
|
||||
__u32 open_flags;
|
||||
__u32 statx_flags;
|
||||
__u32 fadvise_advice;
|
||||
};
|
||||
__u64 user_data; /* data to be passed back at completion time */
|
||||
union {
|
||||
__u16 buf_index; /* index into fixed buffers, if used */
|
||||
struct {
|
||||
/* index into fixed buffers, if used */
|
||||
__u16 buf_index;
|
||||
/* personality to use, if used */
|
||||
__u16 personality;
|
||||
};
|
||||
__u64 __pad2[3];
|
||||
};
|
||||
};
|
||||
|
||||
enum {
|
||||
IOSQE_FIXED_FILE_BIT,
|
||||
IOSQE_IO_DRAIN_BIT,
|
||||
IOSQE_IO_LINK_BIT,
|
||||
IOSQE_IO_HARDLINK_BIT,
|
||||
IOSQE_ASYNC_BIT,
|
||||
};
|
||||
|
||||
/*
|
||||
* sqe->flags
|
||||
*/
|
||||
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
|
||||
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
|
||||
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
|
||||
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
|
||||
/* use fixed fileset */
|
||||
#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
|
||||
/* issue after inflight IO */
|
||||
#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
|
||||
/* links next sqe */
|
||||
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
|
||||
/* like LINK, but stronger */
|
||||
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
|
||||
/* always go async */
|
||||
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
|
||||
|
||||
/*
|
||||
* io_uring_setup() flags
|
||||
@ -57,6 +79,8 @@ struct io_uring_sqe {
|
||||
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
|
||||
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
|
||||
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
|
||||
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
|
||||
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
|
||||
|
||||
enum {
|
||||
IORING_OP_NOP,
|
||||
@ -76,6 +100,19 @@ enum {
|
||||
IORING_OP_ASYNC_CANCEL,
|
||||
IORING_OP_LINK_TIMEOUT,
|
||||
IORING_OP_CONNECT,
|
||||
IORING_OP_FALLOCATE,
|
||||
IORING_OP_OPENAT,
|
||||
IORING_OP_CLOSE,
|
||||
IORING_OP_FILES_UPDATE,
|
||||
IORING_OP_STATX,
|
||||
IORING_OP_READ,
|
||||
IORING_OP_WRITE,
|
||||
IORING_OP_FADVISE,
|
||||
IORING_OP_MADVISE,
|
||||
IORING_OP_SEND,
|
||||
IORING_OP_RECV,
|
||||
IORING_OP_OPENAT2,
|
||||
IORING_OP_EPOLL_CTL,
|
||||
|
||||
/* this goes last, obviously */
|
||||
IORING_OP_LAST,
|
||||
@ -153,7 +190,8 @@ struct io_uring_params {
|
||||
__u32 sq_thread_cpu;
|
||||
__u32 sq_thread_idle;
|
||||
__u32 features;
|
||||
__u32 resv[4];
|
||||
__u32 wq_fd;
|
||||
__u32 resv[3];
|
||||
struct io_sqring_offsets sq_off;
|
||||
struct io_cqring_offsets cq_off;
|
||||
};
|
||||
@ -164,6 +202,8 @@ struct io_uring_params {
|
||||
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
|
||||
#define IORING_FEAT_NODROP (1U << 1)
|
||||
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
|
||||
#define IORING_FEAT_RW_CUR_POS (1U << 3)
|
||||
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
|
||||
|
||||
/*
|
||||
* io_uring_register(2) opcodes and arguments
|
||||
@ -175,6 +215,10 @@ struct io_uring_params {
|
||||
#define IORING_REGISTER_EVENTFD 4
|
||||
#define IORING_UNREGISTER_EVENTFD 5
|
||||
#define IORING_REGISTER_FILES_UPDATE 6
|
||||
#define IORING_REGISTER_EVENTFD_ASYNC 7
|
||||
#define IORING_REGISTER_PROBE 8
|
||||
#define IORING_REGISTER_PERSONALITY 9
|
||||
#define IORING_UNREGISTER_PERSONALITY 10
|
||||
|
||||
struct io_uring_files_update {
|
||||
__u32 offset;
|
||||
@ -182,4 +226,21 @@ struct io_uring_files_update {
|
||||
__aligned_u64 /* __s32 * */ fds;
|
||||
};
|
||||
|
||||
#define IO_URING_OP_SUPPORTED (1U << 0)
|
||||
|
||||
struct io_uring_probe_op {
|
||||
__u8 op;
|
||||
__u8 resv;
|
||||
__u16 flags; /* IO_URING_OP_* flags */
|
||||
__u32 resv2;
|
||||
};
|
||||
|
||||
struct io_uring_probe {
|
||||
__u8 last_op; /* last opcode supported */
|
||||
__u8 ops_len; /* length of ops[] array below */
|
||||
__u16 resv;
|
||||
__u32 resv2[3];
|
||||
struct io_uring_probe_op ops[0];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@@ -234,6 +234,10 @@ struct prctl_mm_map {
 #define PR_GET_TAGGED_ADDR_CTRL		56
 # define PR_TAGGED_ADDR_ENABLE		(1UL << 0)

+/* Control reclaim behavior when allocating memory */
+#define PR_SET_IO_FLUSHER		57
+#define PR_GET_IO_FLUSHER		58
+
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME	0
@@ -697,7 +697,7 @@ void __mmdrop(struct mm_struct *mm)
 	WARN_ON_ONCE(mm == current->active_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-	mmu_notifier_mm_destroy(mm);
+	mmu_notifier_subscriptions_destroy(mm);
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
 	free_mm(mm);
@@ -1036,7 +1036,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
 	RCU_INIT_POINTER(mm->exe_file, NULL);
-	mmu_notifier_mm_init(mm);
+	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
kernel/pid.c (90 lines changed)

@@ -578,3 +578,93 @@ void __init pid_idr_init(void)
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 }
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+	struct file *file;
+	int ret;
+
+	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+		file = fget_task(task, fd);
+	else
+		file = ERR_PTR(-EPERM);
+
+	mutex_unlock(&task->signal->cred_guard_mutex);
+
+	return file ?: ERR_PTR(-EBADF);
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+	struct task_struct *task;
+	struct file *file;
+	int ret;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	file = __pidfd_fget(task, fd);
+	put_task_struct(task);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ret = security_file_receive(file);
+	if (ret) {
+		fput(file);
+		return ret;
+	}
+
+	ret = get_unused_fd_flags(O_CLOEXEC);
+	if (ret < 0)
+		fput(file);
+	else
+		fd_install(ret, file);
+
+	return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd:	the pidfd file descriptor of the process
+ * @fd:		the file descriptor number to get
+ * @flags:	flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+		unsigned int, flags)
+{
+	struct pid *pid;
+	struct fd f;
+	int ret;
+
+	/* flags is currently unused - make sure it's unset */
+	if (flags)
+		return -EINVAL;
+
+	f = fdget(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	pid = pidfd_pid(f.file);
+	if (IS_ERR(pid))
+		ret = PTR_ERR(pid);
+	else
+		ret = pidfd_getfd(pid, fd);
+
+	fdput(f);
+	return ret;
+}
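For orientation, a hedged userspace sketch of the new call: open a pidfd for a target process, then copy one of its descriptors. The fallback syscall numbers come from the tables earlier in this merge, and the target PID and fd number are placeholders:

    #define _GNU_SOURCE
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef __NR_pidfd_open
    #define __NR_pidfd_open 434     /* per the syscall tables in this merge */
    #endif
    #ifndef __NR_pidfd_getfd
    #define __NR_pidfd_getfd 438
    #endif

    int main(void)
    {
            pid_t target = 1234;    /* placeholder PID */
            int pidfd = syscall(__NR_pidfd_open, target, 0);
            int fd;

            if (pidfd < 0) {
                    perror("pidfd_open");
                    return 1;
            }
            /* Copy the target's fd 3 into this process; needs ptrace permission. */
            fd = syscall(__NR_pidfd_getfd, pidfd, 3, 0);
            if (fd < 0)
                    perror("pidfd_getfd");
            else
                    printf("received O_CLOEXEC copy as fd %d\n", fd);
            return 0;
    }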
@@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
 		 * must see ->sighand == NULL.
 		 */
 		spin_lock_irqsave(&sighand->siglock, *flags);
-		if (likely(sighand == tsk->sighand))
+		if (likely(sighand == rcu_access_pointer(tsk->sighand)))
 			break;
 		spin_unlock_irqrestore(&sighand->siglock, *flags);
 	}
kernel/sys.c (25 lines changed)

@@ -2410,6 +2410,8 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
 }
 #endif

+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2640,6 +2642,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = GET_TAGGED_ADDR_CTRL();
 		break;
+	case PR_SET_IO_FLUSHER:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		if (arg2 == 1)
+			current->flags |= PR_IO_FLUSHER;
+		else if (!arg2)
+			current->flags &= ~PR_IO_FLUSHER;
+		else
+			return -EINVAL;
+		break;
+	case PR_GET_IO_FLUSHER:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
+		break;
 	default:
 		error = -EINVAL;
 		break;
@@ -153,7 +153,7 @@ void dump_mm(const struct mm_struct *mm)
 #endif
 		"exe_file %px\n"
 #ifdef CONFIG_MMU_NOTIFIER
-		"mmu_notifier_mm %px\n"
+		"notifier_subscriptions %px\n"
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
@@ -185,7 +185,7 @@ void dump_mm(const struct mm_struct *mm)
 #endif
 		mm->exe_file,
 #ifdef CONFIG_MMU_NOTIFIER
-		mm->mmu_notifier_mm,
+		mm->notifier_subscriptions,
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(unsigned long start, size_t len_in, int behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)

 	return error;
 }
+
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+	return do_madvise(start, len_in, behavior);
+}
File diff suppressed because it is too large.
tools/testing/selftests/pidfd/.gitignore (vendored, 1 line changed)

@@ -2,3 +2,4 @@ pidfd_open_test
 pidfd_poll_test
 pidfd_test
 pidfd_wait
+pidfd_getfd_test
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -g -I../../../../usr/include/ -pthread

-TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test

 include ../lib.mk
|
@ -36,6 +36,10 @@
#define __NR_clone3 -1
#endif

#ifndef __NR_pidfd_getfd
#define __NR_pidfd_getfd -1
#endif

/*
 * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
 * That means, when it wraps around any pid < 300 will be skipped.
@ -84,4 +88,9 @@ static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
	return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}

static inline int sys_pidfd_getfd(int pidfd, int fd, int flags)
{
	return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
}

#endif /* __PIDFD_H */
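Outside the selftest harness, the intended use of the new syscall pairs
pidfd_open() with pidfd_getfd() to pull a file descriptor out of another
process. A minimal standalone sketch, assuming raw syscall() invocations with
the numbers from the generic tables above (434/438) when the installed headers
do not define them yet, and assuming the caller has ptrace access to the
target:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_pidfd_open
	#define __NR_pidfd_open 434
	#endif
	#ifndef __NR_pidfd_getfd
	#define __NR_pidfd_getfd 438
	#endif

	int main(int argc, char **argv)
	{
		pid_t pid;
		int pidfd, fd;

		if (argc < 3) {
			fprintf(stderr, "usage: %s <pid> <remote fd>\n", argv[0]);
			return 1;
		}

		pid = atoi(argv[1]);
		pidfd = syscall(__NR_pidfd_open, pid, 0);
		if (pidfd < 0) {
			perror("pidfd_open");
			return 1;
		}

		/* Duplicate the target's fd into this process; the kernel
		 * installs the new fd with close-on-exec set. */
		fd = syscall(__NR_pidfd_getfd, pidfd, atoi(argv[2]), 0);
		if (fd < 0) {
			perror("pidfd_getfd");
			return 1;
		}

		printf("remote fd %s of pid %d is now local fd %d\n",
		       argv[2], pid, fd);
		return 0;
	}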
|
tools/testing/selftests/pidfd/pidfd_getfd_test.c (new file, 249 lines)
@ -0,0 +1,249 @@
// SPDX-License-Identifier: GPL-2.0

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/kcmp.h>

#include "pidfd.h"
#include "../kselftest.h"
#include "../kselftest_harness.h"

/*
 * UNKNOWN_FD is an fd number that should never exist in the child, as it is
 * used to check the negative case.
 */
#define UNKNOWN_FD 111
#define UID_NOBODY 65535

static int sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1,
		    unsigned long idx2)
{
	return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
}

static int sys_memfd_create(const char *name, unsigned int flags)
{
	return syscall(__NR_memfd_create, name, flags);
}

static int __child(int sk, int memfd)
{
	int ret;
	char buf;

	/*
	 * Ensure we don't leave around a bunch of orphaned children if our
	 * tests fail.
	 */
	ret = prctl(PR_SET_PDEATHSIG, SIGKILL);
	if (ret) {
		fprintf(stderr, "%s: Child could not set DEATHSIG\n",
			strerror(errno));
		return -1;
	}

	ret = send(sk, &memfd, sizeof(memfd), 0);
	if (ret != sizeof(memfd)) {
		fprintf(stderr, "%s: Child failed to send fd number\n",
			strerror(errno));
		return -1;
	}

	/*
	 * The fixture setup is completed at this point. The tests will run.
	 *
	 * This blocking recv enables the parent to message the child.
	 * Either we will read 'P' off of the sk, indicating that we need
	 * to disable ptrace, or we will read a 0, indicating that the other
	 * side has closed the sk. This occurs during fixture teardown time,
	 * indicating that the child should exit.
	 */
	while ((ret = recv(sk, &buf, sizeof(buf), 0)) > 0) {
		if (buf == 'P') {
			ret = prctl(PR_SET_DUMPABLE, 0);
			if (ret < 0) {
				fprintf(stderr,
					"%s: Child failed to disable ptrace\n",
					strerror(errno));
				return -1;
			}
		} else {
			fprintf(stderr, "Child received unknown command %c\n",
				buf);
			return -1;
		}
		ret = send(sk, &buf, sizeof(buf), 0);
		if (ret != 1) {
			fprintf(stderr, "%s: Child failed to ack\n",
				strerror(errno));
			return -1;
		}
	}
	if (ret < 0) {
		fprintf(stderr, "%s: Child failed to read from socket\n",
			strerror(errno));
		return -1;
	}

	return 0;
}

static int child(int sk)
{
	int memfd, ret;

	memfd = sys_memfd_create("test", 0);
	if (memfd < 0) {
		fprintf(stderr, "%s: Child could not create memfd\n",
			strerror(errno));
		ret = -1;
	} else {
		ret = __child(sk, memfd);
		close(memfd);
	}

	close(sk);
	return ret;
}

FIXTURE(child)
{
	/*
	 * remote_fd is the number of the FD which we are trying to retrieve
	 * from the child.
	 */
	int remote_fd;
	/* pid points to the child which we are fetching FDs from */
	pid_t pid;
	/* pidfd is the pidfd of the child */
	int pidfd;
	/*
	 * sk is our side of the socketpair used to communicate with the child.
	 * When it is closed, the child will exit.
	 */
	int sk;
};

FIXTURE_SETUP(child)
{
	int ret, sk_pair[2];

	ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
		TH_LOG("%s: failed to create socketpair", strerror(errno));
	}
	self->sk = sk_pair[0];

	self->pid = fork();
	ASSERT_GE(self->pid, 0);

	if (self->pid == 0) {
		close(sk_pair[0]);
		if (child(sk_pair[1]))
			_exit(EXIT_FAILURE);
		_exit(EXIT_SUCCESS);
	}

	close(sk_pair[1]);

	self->pidfd = sys_pidfd_open(self->pid, 0);
	ASSERT_GE(self->pidfd, 0);

	/*
	 * Wait for the child to complete setup. It'll send the remote memfd's
	 * number when ready.
	 */
	ret = recv(sk_pair[0], &self->remote_fd, sizeof(self->remote_fd), 0);
	ASSERT_EQ(sizeof(self->remote_fd), ret);
}

FIXTURE_TEARDOWN(child)
{
	EXPECT_EQ(0, close(self->pidfd));
	EXPECT_EQ(0, close(self->sk));

	EXPECT_EQ(0, wait_for_pid(self->pid));
}

TEST_F(child, disable_ptrace)
{
	int uid, fd;
	char c;

	/*
	 * Turn into nobody if we're root, to avoid CAP_SYS_PTRACE.
	 *
	 * The tests should run in their own process, so even if this test
	 * fails, it shouldn't result in subsequent tests failing.
	 */
	uid = getuid();
	if (uid == 0)
		ASSERT_EQ(0, seteuid(UID_NOBODY));

	ASSERT_EQ(1, send(self->sk, "P", 1, 0));
	ASSERT_EQ(1, recv(self->sk, &c, 1, 0));

	fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
	EXPECT_EQ(-1, fd);
	EXPECT_EQ(EPERM, errno);

	if (uid == 0)
		ASSERT_EQ(0, seteuid(0));
}

TEST_F(child, fetch_fd)
{
	int fd, ret;

	fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
	ASSERT_GE(fd, 0);

	EXPECT_EQ(0, sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd));

	ret = fcntl(fd, F_GETFD);
	ASSERT_GE(ret, 0);
	EXPECT_GE(ret & FD_CLOEXEC, 0);

	close(fd);
}

TEST_F(child, test_unknown_fd)
{
	int fd;

	fd = sys_pidfd_getfd(self->pidfd, UNKNOWN_FD, 0);
	EXPECT_EQ(-1, fd) {
		TH_LOG("getfd succeeded while fetching unknown fd");
	};
	EXPECT_EQ(EBADF, errno) {
		TH_LOG("%s: getfd did not get EBADF", strerror(errno));
	}
}

TEST(flags_set)
{
	ASSERT_EQ(-1, sys_pidfd_getfd(0, 0, 1));
	EXPECT_EQ(errno, EINVAL);
}

#if __NR_pidfd_getfd == -1
int main(void)
{
	fprintf(stderr, "__NR_pidfd_getfd undefined. The pidfd_getfd syscall is unavailable. Test aborting\n");
	return KSFT_SKIP;
}
#else
TEST_HARNESS_MAIN
#endif
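As with the other pidfd selftests, the new test is built and run through the
usual kselftest flow, e.g. "make -C tools/testing/selftests TARGETS=pidfd
run_tests" from the kernel source tree; on kernels whose headers still leave
__NR_pidfd_getfd undefined it reports SKIP instead of failing.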