Merge 39bed42de2 ("Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma") into android-mainline

Baby steps in the 5.6-rc1 merge cycle to make things easier to review
and debug.

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I0fa183764fd1adbde44e8181f0b3df6cff4da18b
Greg Kroah-Hartman 2020-02-03 15:34:37 +00:00
commit 7881aee544
52 changed files with 2933 additions and 829 deletions


@ -149,14 +149,14 @@ CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
registration of a mmu_interval_notifier::
mni->ops = &driver_ops;
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
unsigned long start, unsigned long length,
struct mm_struct *mm);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mm_struct *mm, unsigned long start,
unsigned long length,
const struct mmu_interval_notifier_ops *ops);
During the driver_ops->invalidate() callback the device driver must perform
the update action to the range (mark range read only, or fully unmap,
etc.). The device must complete the update before the driver callback returns.
During the ops->invalidate() callback the device driver must perform the
update action to the range (mark range read only, or fully unmap, etc.). The
device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use::
@ -183,7 +183,7 @@ The usage pattern is::
struct hmm_range range;
...
range.notifier = &mni;
range.notifier = &interval_sub;
range.start = ...;
range.end = ...;
range.pfns = ...;
@ -191,11 +191,11 @@ The usage pattern is::
range.values = ...;
range.pfn_shift = ...;
if (!mmget_not_zero(mni->notifier.mm))
if (!mmget_not_zero(interval_sub->notifier.mm))
return -EFAULT;
again:
range.notifier_seq = mmu_interval_read_begin(&mni);
range.notifier_seq = mmu_interval_read_begin(&interval_sub);
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
if (ret) {
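
For reference only (not part of this merge), a minimal sketch of what a driver's
ops->invalidate() callback described above could look like against the renamed
mmu_interval_notifier API. The driver_data structure, its mutex and
driver_update_device_ptes() are hypothetical placeholders::

    /* All driver_* names below are illustrative, not code from this commit. */
    struct driver_data {
            struct mmu_interval_notifier notifier;
            struct mutex mutex;
            /* ... device page table state ... */
    };

    static bool driver_interval_invalidate(struct mmu_interval_notifier *interval_sub,
                                           const struct mmu_notifier_range *range,
                                           unsigned long cur_seq)
    {
            struct driver_data *drv =
                    container_of(interval_sub, struct driver_data, notifier);

            /* Non-blockable invalidations may be refused. */
            if (!mmu_notifier_range_blockable(range))
                    return false;

            mutex_lock(&drv->mutex);
            /*
             * Record the new sequence number before tearing down the device
             * mapping, so a concurrent hmm_range_fault() user observes the
             * collision via mmu_interval_read_retry().
             */
            mmu_interval_set_seq(interval_sub, cur_seq);
            driver_update_device_ptes(drv, range->start, range->end);
            mutex_unlock(&drv->mutex);
            return true;
    }

    static const struct mmu_interval_notifier_ops driver_ops = {
            .invalidate = driver_interval_invalidate,
    };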


@ -476,3 +476,4 @@
544 common pidfd_open sys_pidfd_open
# 545 reserved for clone3
547 common openat2 sys_openat2
548 common pidfd_getfd sys_pidfd_getfd


@ -450,3 +450,4 @@
434 common pidfd_open sys_pidfd_open
435 common clone3 sys_clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 438
#define __NR_compat_syscalls 439
#endif
#define __ARCH_WANT_SYS_CLONE


@ -881,6 +881,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
__SYSCALL(__NR_clone3, sys_clone3)
#define __NR_openat2 437
__SYSCALL(__NR_openat2, sys_openat2)
#define __NR_pidfd_getfd 438
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
/*
* Please add new compat syscalls above this comment and update


@ -357,3 +357,4 @@
434 common pidfd_open sys_pidfd_open
# 435 reserved for clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -436,3 +436,4 @@
434 common pidfd_open sys_pidfd_open
435 common clone3 __sys_clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -442,3 +442,4 @@
434 common pidfd_open sys_pidfd_open
435 common clone3 sys_clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -375,3 +375,4 @@
434 n32 pidfd_open sys_pidfd_open
435 n32 clone3 __sys_clone3
437 n32 openat2 sys_openat2
438 n32 pidfd_getfd sys_pidfd_getfd


@ -351,3 +351,4 @@
434 n64 pidfd_open sys_pidfd_open
435 n64 clone3 __sys_clone3
437 n64 openat2 sys_openat2
438 n64 pidfd_getfd sys_pidfd_getfd


@ -424,3 +424,4 @@
434 o32 pidfd_open sys_pidfd_open
435 o32 clone3 __sys_clone3
437 o32 openat2 sys_openat2
438 o32 pidfd_getfd sys_pidfd_getfd


@ -434,3 +434,4 @@
434 common pidfd_open sys_pidfd_open
435 common clone3 sys_clone3_wrapper
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -518,3 +518,4 @@
434 common pidfd_open sys_pidfd_open
435 nospu clone3 ppc_clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -439,3 +439,4 @@
434 common pidfd_open sys_pidfd_open sys_pidfd_open
435 common clone3 sys_clone3 sys_clone3
437 common openat2 sys_openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd


@ -439,3 +439,4 @@
434 common pidfd_open sys_pidfd_open
# 435 reserved for clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -482,3 +482,4 @@
434 common pidfd_open sys_pidfd_open
# 435 reserved for clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -441,3 +441,4 @@
434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
435 i386 clone3 sys_clone3 __ia32_sys_clone3
437 i386 openat2 sys_openat2 __ia32_sys_openat2
438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd


@ -358,6 +358,7 @@
434 common pidfd_open __x64_sys_pidfd_open
435 common clone3 __x64_sys_clone3/ptregs
437 common openat2 __x64_sys_openat2
438 common pidfd_getfd __x64_sys_pidfd_getfd
#
# x32-specific system call numbers start at 512 to avoid cache impact


@ -407,3 +407,4 @@
434 common pidfd_open sys_pidfd_open
435 common clone3 sys_clone3
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd


@ -2402,10 +2402,12 @@ static void binder_deferred_fd_close(int fd)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
__close_fd_get_file(fd, &twcb->file);
if (twcb->file)
if (twcb->file) {
filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, true);
else
} else {
kfree(twcb);
}
}
static void binder_transaction_buffer_release(struct binder_proc *proc,


@ -355,12 +355,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;
}
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
@ -2076,27 +2070,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return do_epoll_create(0);
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
bool nonblock)
{
if (!nonblock) {
mutex_lock_nested(mutex, depth);
return 0;
}
if (mutex_trylock(mutex))
return 0;
return -EAGAIN;
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
@ -2114,7 +2109,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
@ -2130,11 +2125,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
* Also, we do not currently support nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@ -2159,13 +2154,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
full_check = 1;
mutex_unlock(&ep->mtx);
mutex_lock(&epmutex);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
@ -2175,10 +2174,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error) {
out_del:
list_del(&tf.file->f_tfile_llink);
goto error_tgt_fput;
}
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
mutex_lock_nested(&tep->mtx, 1);
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
if (error) {
mutex_unlock(&ep->mtx);
goto out_del;
}
}
}
}
@ -2194,8 +2202,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
@ -2210,8 +2218,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, &epds);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
@ -2233,6 +2241,23 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return error;
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).


@ -642,7 +642,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
* variant of __close_fd that gets a ref on the file for later fput
* variant of __close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() is called on the file, and then
* an fput().
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return filp_close(file, files);
return 0;
out_unlock:
spin_unlock(&files->file_lock);
@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
fmode_t mask, unsigned int refs)
{
struct files_struct *files = current->files;
struct file *file;
rcu_read_lock();
@ -729,6 +731,12 @@ static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
return file;
}
static inline struct file *__fget(unsigned int fd, fmode_t mask,
unsigned int refs)
{
return __fget_files(current->files, fd, mask, refs);
}
struct file *fget_many(unsigned int fd, unsigned int refs)
{
return __fget(fd, FMODE_PATH, refs);
@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd)
}
EXPORT_SYMBOL(fget_raw);
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
struct file *file = NULL;
task_lock(task);
if (task->files)
file = __fget_files(task->files, fd, 0, 1);
task_unlock(task);
return file;
}
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*


@ -122,6 +122,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
long do_faccessat(int dfd, const char __user *filename, int mode);
@ -180,3 +182,9 @@ extern const struct dentry_operations ns_dentry_operations;
/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
int cp_statx(const struct kstat *stat, struct statx __user *buffer);


@ -56,7 +56,8 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
const struct cred *creds;
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
};
@ -109,10 +110,10 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
const struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
refcount_t use_refs;
};
static bool io_worker_get(struct io_worker *worker)
@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
if (worker->creds) {
revert_creds(worker->creds);
worker->creds = NULL;
if (worker->saved_creds) {
revert_creds(worker->saved_creds);
worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
return NULL;
}
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
if (worker->mm) {
unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
if (!work->mm) {
set_fs(KERNEL_DS);
return;
}
if (mmget_not_zero(work->mm)) {
use_mm(work->mm);
if (!worker->mm)
set_fs(USER_DS);
worker->mm = work->mm;
/* hang on to this mm */
work->mm = NULL;
return;
}
/* failed grabbing mm, ensure work gets cancelled */
work->flags |= IO_WQ_WORK_CANCEL;
}
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
const struct cred *old_creds = override_creds(work->creds);
worker->cur_creds = work->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
worker->saved_creds = old_creds;
}
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
@ -438,24 +476,19 @@ static void io_worker_handle_work(struct io_worker *worker)
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
current->files != work->files) {
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
wq->mm) {
if (mmget_not_zero(wq->mm)) {
use_mm(wq->mm);
set_fs(USER_DS);
worker->mm = wq->mm;
} else {
work->flags |= IO_WQ_WORK_CANCEL;
}
}
if (!worker->creds)
worker->creds = override_creds(wq->creds);
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
/*
@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
if (!atomic_read(&acct->nr_running))
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct);
}
@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return false;
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work == work) {
if (worker->cur_work == work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->done);
/* caller must have already done mmgrab() on this mm */
wq->mm = data->mm;
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = -ENOMEM;
goto err;
}
refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
@ -1078,13 +1113,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(ret);
}
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
}
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
wake_up_process(worker->task);
return false;
}
void io_wq_destroy(struct io_wq *wq)
static void __io_wq_destroy(struct io_wq *wq)
{
int node;
@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
kfree(wq->wqes);
kfree(wq);
}
void io_wq_destroy(struct io_wq *wq)
{
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}


@ -7,11 +7,11 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_NEEDS_USER = 8,
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@ -72,6 +72,8 @@ struct io_wq_work {
};
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
unsigned flags;
};
@ -81,21 +83,22 @@ struct io_wq_work {
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
(work)->mm = NULL; \
(work)->creds = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct mm_struct *mm;
struct user_struct *user;
const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);

File diff suppressed because it is too large.


@ -970,7 +970,7 @@ EXPORT_SYMBOL(open_with_fake_path);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
static inline struct open_how build_open_how(int flags, umode_t mode)
inline struct open_how build_open_how(int flags, umode_t mode)
{
struct open_how how = {
.flags = flags & VALID_OPEN_FLAGS,
@ -986,8 +986,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
return how;
}
static inline int build_open_flags(const struct open_how *how,
struct open_flags *op)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
int flags = how->flags;
int lookup_flags = 0;


@ -21,6 +21,8 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include "internal.h"
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @inode: Inode to use as the source
@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
}
EXPORT_SYMBOL(vfs_statx_fd);
inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
{
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
return -EINVAL;
*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
*lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
*lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
*lookup_flags |= LOOKUP_EMPTY;
return 0;
}
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
{
struct path path;
int error = -EINVAL;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
unsigned lookup_flags;
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
static noinline_for_stack int
noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
struct statx tmp;


@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock);
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
#else
static inline void eventpoll_init_file(struct file *file) {}


@ -16,6 +16,7 @@ extern void fput(struct file *);
extern void fput_many(struct file *, unsigned int);
struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
@ -47,6 +48,7 @@ static inline void fdput(struct fd fd)
extern struct file *fget(unsigned int fd);
extern struct file *fget_many(unsigned int fd, unsigned int refs);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);


@ -2324,6 +2324,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,


@ -497,7 +497,7 @@ struct mm_struct {
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */


@ -8,7 +8,7 @@
#include <linux/srcu.h>
#include <linux/interval_tree.h>
struct mmu_notifier_mm;
struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;
@ -73,7 +73,7 @@ struct mmu_notifier_ops {
* through the gart alias address, so leading to memory
* corruption.
*/
void (*release)(struct mmu_notifier *mn,
void (*release)(struct mmu_notifier *subscription,
struct mm_struct *mm);
/*
@ -85,7 +85,7 @@ struct mmu_notifier_ops {
* Start-end is necessary in case the secondary MMU is mapping the page
* at a smaller granularity than the primary MMU.
*/
int (*clear_flush_young)(struct mmu_notifier *mn,
int (*clear_flush_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
@ -95,7 +95,7 @@ struct mmu_notifier_ops {
* latter, it is supposed to test-and-clear the young/accessed bitflag
* in the secondary pte, but it may omit flushing the secondary tlb.
*/
int (*clear_young)(struct mmu_notifier *mn,
int (*clear_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
@ -106,7 +106,7 @@ struct mmu_notifier_ops {
* frequently used without actually clearing the flag or tearing
* down the secondary mapping on the page.
*/
int (*test_young)(struct mmu_notifier *mn,
int (*test_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long address);
@ -114,7 +114,7 @@ struct mmu_notifier_ops {
* change_pte is called in cases that pte mapping to page is changed:
* for example, when ksm remaps pte to point to a new shared page.
*/
void (*change_pte)(struct mmu_notifier *mn,
void (*change_pte)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long address,
pte_t pte);
@ -169,9 +169,9 @@ struct mmu_notifier_ops {
* invalidate_range_end.
*
*/
int (*invalidate_range_start)(struct mmu_notifier *mn,
int (*invalidate_range_start)(struct mmu_notifier *subscription,
const struct mmu_notifier_range *range);
void (*invalidate_range_end)(struct mmu_notifier *mn,
void (*invalidate_range_end)(struct mmu_notifier *subscription,
const struct mmu_notifier_range *range);
/*
@ -192,8 +192,10 @@ struct mmu_notifier_ops {
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);
void (*invalidate_range)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
/*
* These callbacks are used with the get/put interface to manage the
@ -206,7 +208,7 @@ struct mmu_notifier_ops {
* and cannot sleep.
*/
struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
void (*free_notifier)(struct mmu_notifier *mn);
void (*free_notifier)(struct mmu_notifier *subscription);
};
/*
@ -235,7 +237,7 @@ struct mmu_notifier {
* was required but mmu_notifier_range_blockable(range) is false.
*/
struct mmu_interval_notifier_ops {
bool (*invalidate)(struct mmu_interval_notifier *mni,
bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
};
@ -265,7 +267,7 @@ struct mmu_notifier_range {
static inline int mm_has_notifiers(struct mm_struct *mm)
{
return unlikely(mm->mmu_notifier_mm);
return unlikely(mm->notifier_subscriptions);
}
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
@ -280,30 +282,31 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
up_write(&mm->mmap_sem);
return ret;
}
void mmu_notifier_put(struct mmu_notifier *mn);
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);
extern int mmu_notifier_register(struct mmu_notifier *mn,
extern int mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *mn,
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
struct mm_struct *mm);
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mm_struct *mm, unsigned long start,
unsigned long length,
const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
struct mmu_interval_notifier *mni, struct mm_struct *mm,
struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
unsigned long start, unsigned long length,
const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);
/**
* mmu_interval_set_seq - Save the invalidation sequence
* @mni - The mni passed to invalidate
* @interval_sub - The subscription passed to invalidate
* @cur_seq - The cur_seq passed to the invalidate() callback
*
* This must be called unconditionally from the invalidate callback of a
@ -314,15 +317,16 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
* If the caller does not call mmu_interval_read_begin() or
* mmu_interval_read_retry() then this call is not required.
*/
static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
unsigned long cur_seq)
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
unsigned long cur_seq)
{
WRITE_ONCE(mni->invalidate_seq, cur_seq);
WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}
/**
* mmu_interval_read_retry - End a read side critical section against a VA range
* mni: The range
* interval_sub: The subscription
* seq: The return of the paired mmu_interval_read_begin()
*
* This MUST be called under a user provided lock that is also held
@ -334,15 +338,16 @@ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
* Returns true if an invalidation collided with this critical section, and
* the caller should retry.
*/
static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
unsigned long seq)
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
unsigned long seq)
{
return mni->invalidate_seq != seq;
return interval_sub->invalidate_seq != seq;
}
/**
* mmu_interval_check_retry - Test if a collision has occurred
* mni: The range
* interval_sub: The subscription
* seq: The return of the matching mmu_interval_read_begin()
*
* This can be used in the critical section between mmu_interval_read_begin()
@ -357,14 +362,15 @@ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
* This call can be used as part of loops and other expensive operations to
* expedite a retry.
*/
static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
unsigned long seq)
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
unsigned long seq)
{
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
return READ_ONCE(mni->invalidate_seq) != seq;
return READ_ONCE(interval_sub->invalidate_seq) != seq;
}
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
@ -480,15 +486,15 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
__mmu_notifier_invalidate_range(mm, start, end);
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
mm->mmu_notifier_mm = NULL;
mm->notifier_subscriptions = NULL;
}
static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
if (mm_has_notifiers(mm))
__mmu_notifier_mm_destroy(mm);
__mmu_notifier_subscriptions_destroy(mm);
}
@ -692,11 +698,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}
static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}


@ -209,6 +209,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
percpu_ref_get_many(ref, 1);
}
/**
* percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get
* @nr: number of references to get
*
* Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
unsigned long nr)
{
unsigned long __percpu *percpu_count;
bool ret;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_add(*percpu_count, nr);
ret = true;
} else {
ret = atomic_long_add_unless(&ref->count, nr, 0);
}
rcu_read_unlock();
return ret;
}
/**
* percpu_ref_tryget - try to increment a percpu refcount
* @ref: percpu_ref to try-get
@ -220,21 +250,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count;
bool ret;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
ret = true;
} else {
ret = atomic_long_inc_not_zero(&ref->count);
}
rcu_read_unlock();
return ret;
return percpu_ref_tryget_many(ref, 1);
}
/**


@ -921,7 +921,7 @@ struct task_struct {
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct *sighand;
struct sighand_struct __rcu *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */


@ -1002,6 +1002,7 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
siginfo_t __user *info,
unsigned int flags);
asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
/*
* Architecture-specific system calls


@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @opcode: opcode of request
* @user_data: user data associated with the request
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( bool, force_nonblock )
__field( bool, sq_thread )
@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, (unsigned long long) __entry->user_data,
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
);


@ -853,9 +853,11 @@ __SYSCALL(__NR_clone3, sys_clone3)
#define __NR_openat2 437
__SYSCALL(__NR_openat2, sys_openat2)
#define __NR_pidfd_getfd 438
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#undef __NR_syscalls
#define __NR_syscalls 438
#define __NR_syscalls 439
/*
* 32 bit systems traditionally used different


@ -301,6 +301,7 @@ struct vfs_ns_cap_data {
/* Allow more than 64hz interrupts from the real-time clock */
/* Override max number of consoles on console allocation */
/* Override max number of keymaps */
/* Control memory reclaim behavior */
#define CAP_SYS_RESOURCE 24


@ -34,21 +34,43 @@ struct io_uring_sqe {
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
};
__u64 user_data; /* data to be passed back at completion time */
union {
__u16 buf_index; /* index into fixed buffers, if used */
struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* personality to use, if used */
__u16 personality;
};
__u64 __pad2[3];
};
};
enum {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
};
/*
* sqe->flags
*/
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
/* use fixed fileset */
#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/*
* io_uring_setup() flags
@ -57,6 +79,8 @@ struct io_uring_sqe {
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
enum {
IORING_OP_NOP,
@ -76,6 +100,19 @@ enum {
IORING_OP_ASYNC_CANCEL,
IORING_OP_LINK_TIMEOUT,
IORING_OP_CONNECT,
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
IORING_OP_FILES_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
IORING_OP_FADVISE,
IORING_OP_MADVISE,
IORING_OP_SEND,
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
/* this goes last, obviously */
IORING_OP_LAST,
@ -153,7 +190,8 @@ struct io_uring_params {
__u32 sq_thread_cpu;
__u32 sq_thread_idle;
__u32 features;
__u32 resv[4];
__u32 wq_fd;
__u32 resv[3];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};
@ -164,6 +202,8 @@ struct io_uring_params {
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
/*
* io_uring_register(2) opcodes and arguments
@ -175,6 +215,10 @@ struct io_uring_params {
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6
#define IORING_REGISTER_EVENTFD_ASYNC 7
#define IORING_REGISTER_PROBE 8
#define IORING_REGISTER_PERSONALITY 9
#define IORING_UNREGISTER_PERSONALITY 10
struct io_uring_files_update {
__u32 offset;
@ -182,4 +226,21 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
__u8 op;
__u8 resv;
__u16 flags; /* IO_URING_OP_* flags */
__u32 resv2;
};
struct io_uring_probe {
__u8 last_op; /* last opcode supported */
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
};
#endif
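
As a usage note (not part of this diff), userspace could exercise the new
IORING_REGISTER_PROBE opcode roughly as sketched below. The raw syscall numbers
in the fallback defines are the x86-64 ones, and the example assumes a
linux/io_uring.h that already carries the probe definitions added above:

/* Hypothetical probe of which io_uring opcodes the running kernel supports. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_setup
#define __NR_io_uring_setup	425	/* x86-64 */
#endif
#ifndef __NR_io_uring_register
#define __NR_io_uring_register	427	/* x86-64 */
#endif

int main(void)
{
	struct io_uring_params p;
	memset(&p, 0, sizeof(p));

	int ring_fd = syscall(__NR_io_uring_setup, 4, &p);
	if (ring_fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	/* Header plus room for one io_uring_probe_op per defined opcode. */
	size_t len = sizeof(struct io_uring_probe) +
		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, len);
	if (!probe)
		return 1;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    probe, IORING_OP_LAST) < 0) {
		perror("IORING_REGISTER_PROBE");
		return 1;
	}

	for (int i = 0; i < probe->ops_len; i++)
		printf("op %2d: %s\n", probe->ops[i].op,
		       (probe->ops[i].flags & IO_URING_OP_SUPPORTED) ?
				"supported" : "not supported");

	free(probe);
	close(ring_fd);
	return 0;
}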


@ -234,6 +234,10 @@ struct prctl_mm_map {
#define PR_GET_TAGGED_ADDR_CTRL 56
# define PR_TAGGED_ADDR_ENABLE (1UL << 0)
/* Control reclaim behavior when allocating memory */
#define PR_SET_IO_FLUSHER 57
#define PR_GET_IO_FLUSHER 58
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0


@ -697,7 +697,7 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
free_mm(mm);
@ -1036,7 +1036,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;


@ -578,3 +578,93 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
int ret;
ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
if (ret)
return ERR_PTR(ret);
if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
file = fget_task(task, fd);
else
file = ERR_PTR(-EPERM);
mutex_unlock(&task->signal->cred_guard_mutex);
return file ?: ERR_PTR(-EBADF);
}
static int pidfd_getfd(struct pid *pid, int fd)
{
struct task_struct *task;
struct file *file;
int ret;
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
file = __pidfd_fget(task, fd);
put_task_struct(task);
if (IS_ERR(file))
return PTR_ERR(file);
ret = security_file_receive(file);
if (ret) {
fput(file);
return ret;
}
ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
fput(file);
else
fd_install(ret, file);
return ret;
}
/**
* sys_pidfd_getfd() - Get a file descriptor from another process
*
* @pidfd: the pidfd file descriptor of the process
* @fd: the file descriptor number to get
* @flags: flags on how to get the fd (reserved)
*
* This syscall gets a copy of a file descriptor from another process
* based on the pidfd, and file descriptor number. It requires that
* the calling process has the ability to ptrace the process represented
* by the pidfd. The process which is having its file descriptor copied
* is otherwise unaffected.
*
* Return: On success, a cloexec file descriptor is returned.
* On error, a negative errno number will be returned.
*/
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
struct fd f;
int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
pid = pidfd_pid(f.file);
if (IS_ERR(pid))
ret = PTR_ERR(pid);
else
ret = pidfd_getfd(pid, fd);
fdput(f);
return ret;
}
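
From userspace, the new syscall is reached via syscall(2) until a libc wrapper
exists; a minimal sketch, not part of this diff, in which the target pid and fd
number are placeholders and the fallback syscall numbers mirror the tables
added above:

/* Hypothetical userspace use of pidfd_open() + pidfd_getfd(). */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open		434
#endif
#ifndef __NR_pidfd_getfd
#define __NR_pidfd_getfd	438
#endif

int main(void)
{
	pid_t target = 1234;	/* placeholder: pid of the other process */
	int remote_fd = 3;	/* placeholder: fd number inside that process */

	int pidfd = syscall(__NR_pidfd_open, target, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* flags must be 0 for now; the returned descriptor is O_CLOEXEC. */
	int fd = syscall(__NR_pidfd_getfd, pidfd, remote_fd, 0);
	if (fd < 0) {
		perror("pidfd_getfd");	/* e.g. EPERM without ptrace access */
		return 1;
	}

	printf("local fd %d now refers to the target's fd %d\n", fd, remote_fd);
	return 0;
}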


@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
* must see ->sighand == NULL.
*/
spin_lock_irqsave(&sighand->siglock, *flags);
if (likely(sighand == tsk->sighand))
if (likely(sighand == rcu_access_pointer(tsk->sighand)))
break;
spin_unlock_irqrestore(&sighand->siglock, *flags);
}


@ -2410,6 +2410,8 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@ -2640,6 +2642,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = GET_TAGGED_ADDR_CTRL();
break;
case PR_SET_IO_FLUSHER:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (arg3 || arg4 || arg5)
return -EINVAL;
if (arg2 == 1)
current->flags |= PR_IO_FLUSHER;
else if (!arg2)
current->flags &= ~PR_IO_FLUSHER;
else
return -EINVAL;
break;
case PR_GET_IO_FLUSHER:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
default:
error = -EINVAL;
break;
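
For illustration only, a userspace I/O daemon could flip the new flag as in the
sketch below; the fallback defines simply mirror the uapi values added earlier
in this commit:

/* Hypothetical use of the new PR_SET_IO_FLUSHER / PR_GET_IO_FLUSHER prctls. */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_IO_FLUSHER
#define PR_SET_IO_FLUSHER	57
#define PR_GET_IO_FLUSHER	58
#endif

int main(void)
{
	/* Mark this task as being part of the storage/reclaim path. */
	if (prctl(PR_SET_IO_FLUSHER, 1, 0, 0, 0)) {
		perror("PR_SET_IO_FLUSHER");	/* needs CAP_SYS_RESOURCE */
		return 1;
	}

	/* Returns 1 if the flag is set, 0 otherwise. */
	printf("IO_FLUSHER = %d\n", prctl(PR_GET_IO_FLUSHER, 0, 0, 0, 0));
	return 0;
}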


@ -153,7 +153,7 @@ void dump_mm(const struct mm_struct *mm)
#endif
"exe_file %px\n"
#ifdef CONFIG_MMU_NOTIFIER
"mmu_notifier_mm %px\n"
"notifier_subscriptions %px\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
@ -185,7 +185,7 @@ void dump_mm(const struct mm_struct *mm)
#endif
mm->exe_file,
#ifdef CONFIG_MMU_NOTIFIER
mm->mmu_notifier_mm,
mm->notifier_subscriptions,
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,


@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(start, len_in, behavior);
}

File diff suppressed because it is too large.


@ -2,3 +2,4 @@ pidfd_open_test
pidfd_poll_test
pidfd_test
pidfd_wait
pidfd_getfd_test


@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g -I../../../../usr/include/ -pthread
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test
include ../lib.mk


@ -36,6 +36,10 @@
#define __NR_clone3 -1
#endif
#ifndef __NR_pidfd_getfd
#define __NR_pidfd_getfd -1
#endif
/*
* The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
* That means, when it wraps around any pid < 300 will be skipped.
@ -84,4 +88,9 @@ static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}
static inline int sys_pidfd_getfd(int pidfd, int fd, int flags)
{
return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
}
#endif /* __PIDFD_H */


@ -0,0 +1,249 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/kcmp.h>
#include "pidfd.h"
#include "../kselftest.h"
#include "../kselftest_harness.h"
/*
* UNKNOWN_FD is an fd number that should never exist in the child, as it is
* used to check the negative case.
*/
#define UNKNOWN_FD 111
#define UID_NOBODY 65535
static int sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1,
unsigned long idx2)
{
return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
}
static int sys_memfd_create(const char *name, unsigned int flags)
{
return syscall(__NR_memfd_create, name, flags);
}
static int __child(int sk, int memfd)
{
int ret;
char buf;
/*
* Ensure we don't leave around a bunch of orphaned children if our
* tests fail.
*/
ret = prctl(PR_SET_PDEATHSIG, SIGKILL);
if (ret) {
fprintf(stderr, "%s: Child could not set DEATHSIG\n",
strerror(errno));
return -1;
}
ret = send(sk, &memfd, sizeof(memfd), 0);
if (ret != sizeof(memfd)) {
fprintf(stderr, "%s: Child failed to send fd number\n",
strerror(errno));
return -1;
}
/*
* The fixture setup is completed at this point. The tests will run.
*
* This blocking recv enables the parent to message the child.
* Either we will read 'P' off of the sk, indicating that we need
* to disable ptrace, or we will read a 0, indicating that the other
* side has closed the sk. This occurs during fixture teardown time,
* indicating that the child should exit.
*/
while ((ret = recv(sk, &buf, sizeof(buf), 0)) > 0) {
if (buf == 'P') {
ret = prctl(PR_SET_DUMPABLE, 0);
if (ret < 0) {
fprintf(stderr,
"%s: Child failed to disable ptrace\n",
strerror(errno));
return -1;
}
} else {
fprintf(stderr, "Child received unknown command %c\n",
buf);
return -1;
}
ret = send(sk, &buf, sizeof(buf), 0);
if (ret != 1) {
fprintf(stderr, "%s: Child failed to ack\n",
strerror(errno));
return -1;
}
}
if (ret < 0) {
fprintf(stderr, "%s: Child failed to read from socket\n",
strerror(errno));
return -1;
}
return 0;
}
static int child(int sk)
{
int memfd, ret;
memfd = sys_memfd_create("test", 0);
if (memfd < 0) {
fprintf(stderr, "%s: Child could not create memfd\n",
strerror(errno));
ret = -1;
} else {
ret = __child(sk, memfd);
close(memfd);
}
close(sk);
return ret;
}
FIXTURE(child)
{
/*
* remote_fd is the number of the FD which we are trying to retrieve
* from the child.
*/
int remote_fd;
/* pid points to the child which we are fetching FDs from */
pid_t pid;
/* pidfd is the pidfd of the child */
int pidfd;
/*
* sk is our side of the socketpair used to communicate with the child.
* When it is closed, the child will exit.
*/
int sk;
};
FIXTURE_SETUP(child)
{
int ret, sk_pair[2];
ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
TH_LOG("%s: failed to create socketpair", strerror(errno));
}
self->sk = sk_pair[0];
self->pid = fork();
ASSERT_GE(self->pid, 0);
if (self->pid == 0) {
close(sk_pair[0]);
if (child(sk_pair[1]))
_exit(EXIT_FAILURE);
_exit(EXIT_SUCCESS);
}
close(sk_pair[1]);
self->pidfd = sys_pidfd_open(self->pid, 0);
ASSERT_GE(self->pidfd, 0);
/*
* Wait for the child to complete setup. It'll send the remote memfd's
* number when ready.
*/
ret = recv(sk_pair[0], &self->remote_fd, sizeof(self->remote_fd), 0);
ASSERT_EQ(sizeof(self->remote_fd), ret);
}
FIXTURE_TEARDOWN(child)
{
EXPECT_EQ(0, close(self->pidfd));
EXPECT_EQ(0, close(self->sk));
EXPECT_EQ(0, wait_for_pid(self->pid));
}
TEST_F(child, disable_ptrace)
{
int uid, fd;
char c;
/*
* Turn into nobody if we're root, to avoid CAP_SYS_PTRACE
*
* The tests should run in their own process, so even if this test fails,
* it shouldn't result in subsequent tests failing.
*/
uid = getuid();
if (uid == 0)
ASSERT_EQ(0, seteuid(UID_NOBODY));
ASSERT_EQ(1, send(self->sk, "P", 1, 0));
ASSERT_EQ(1, recv(self->sk, &c, 1, 0));
fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
EXPECT_EQ(-1, fd);
EXPECT_EQ(EPERM, errno);
if (uid == 0)
ASSERT_EQ(0, seteuid(0));
}
TEST_F(child, fetch_fd)
{
int fd, ret;
fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
ASSERT_GE(fd, 0);
EXPECT_EQ(0, sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd));
ret = fcntl(fd, F_GETFD);
ASSERT_GE(ret, 0);
EXPECT_GE(ret & FD_CLOEXEC, 0);
close(fd);
}
TEST_F(child, test_unknown_fd)
{
int fd;
fd = sys_pidfd_getfd(self->pidfd, UNKNOWN_FD, 0);
EXPECT_EQ(-1, fd) {
TH_LOG("getfd succeeded while fetching unknown fd");
};
EXPECT_EQ(EBADF, errno) {
TH_LOG("%s: getfd did not get EBADF", strerror(errno));
}
}
TEST(flags_set)
{
ASSERT_EQ(-1, sys_pidfd_getfd(0, 0, 1));
EXPECT_EQ(errno, EINVAL);
}
#if __NR_pidfd_getfd == -1
int main(void)
{
fprintf(stderr, "__NR_pidfd_getfd undefined. The pidfd_getfd syscall is unavailable. Test aborting\n");
return KSFT_SKIP;
}
#else
TEST_HARNESS_MAIN
#endif