io_uring: import 5.15-stable io_uring

No upstream commit exists.

This imports the io_uring codebase from 5.15.85, wholesale. Changes
from that codebase:

- Drop IOCB_ALLOC_CACHE; we don't have that in 5.10.
- Drop MKDIRAT/SYMLINKAT/LINKAT. These would require further VFS backports,
  and we don't support them in 5.10 to begin with.
- sock_from_file() old-style calling convention.
- Use compat_get_bitmap() only for CONFIG_COMPAT=y (see the sketch below).
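The CONFIG_COMPAT item refers to the pattern sketched below: the compat bitmap
copy is only compiled in when the kernel has compat support, and only taken for
a compat caller. This is a minimal illustration of the shape used by the
IORING_REGISTER_IOWQ_AFF handling, not the literal hunk from the imported code;
the function name example_copy_iowq_aff() and its arguments are made up for the
example.

#include <linux/compat.h>
#include <linux/cpumask.h>
#include <linux/uaccess.h>

/* Illustrative sketch of the CONFIG_COMPAT guard: copy a CPU mask from
 * userspace, using compat_get_bitmap() only for CONFIG_COMPAT=y and a
 * compat caller, plain copy_from_user() otherwise. */
static int example_copy_iowq_aff(void __user *arg, unsigned int len,
                                 cpumask_var_t new_mask)
{
        int ret;

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(new_mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8);
        else
#endif
                ret = copy_from_user(new_mask, arg, len);

        return ret ? -EFAULT : 0;
}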

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Jens Axboe 2022-12-22 14:30:11 -07:00 committed by Greg Kroah-Hartman
parent ed30050329
commit 788d082426
15 changed files with 6855 additions and 5622 deletions


@@ -1128,7 +1128,7 @@ export MODORDER := $(extmod-prefix)modules.order
 export MODULES_NSDEPS := $(extmod-prefix)modules.nsdeps

 ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ io_uring/

 vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
                $(core-y) $(core-m) $(drivers-y) $(drivers-m) \


@@ -32,8 +32,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD) += eventfd.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_AIO) += aio.o
-obj-$(CONFIG_IO_URING) += io_uring.o
-obj-$(CONFIG_IO_WQ) += io-wq.o
 obj-$(CONFIG_FS_DAX) += dax.o
 obj-$(CONFIG_FS_ENCRYPTION) += crypto/
 obj-$(CONFIG_FS_VERITY) += verity/

fs/io-wq.c (1242 changed lines): file diff suppressed because it is too large.


@@ -5,50 +5,20 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>

-struct io_identity {
-        struct files_struct *files;
-        struct mm_struct *mm;
-#ifdef CONFIG_BLK_CGROUP
-        struct cgroup_subsys_state *blkcg_css;
-#endif
-        const struct cred *creds;
-        struct nsproxy *nsproxy;
-        struct fs_struct *fs;
-        unsigned long fsize;
-#ifdef CONFIG_AUDIT
-        kuid_t loginuid;
-        unsigned int sessionid;
-#endif
-        refcount_t count;
-};
-
-struct io_uring_task {
-        /* submission side */
-        struct xarray xa;
-        struct wait_queue_head wait;
-        struct file *last;
-        struct percpu_counter inflight;
-        struct io_identity __identity;
-        struct io_identity *identity;
-        atomic_t in_idle;
-        bool sqpoll;
-};
-
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
-void __io_uring_task_cancel(void);
-void __io_uring_files_cancel(struct files_struct *files);
+void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);

+static inline void io_uring_files_cancel(void)
+{
+        if (current->io_uring)
+                __io_uring_cancel(false);
+}
 static inline void io_uring_task_cancel(void)
 {
-        if (current->io_uring && !xa_empty(&current->io_uring->xa))
-                __io_uring_task_cancel();
-}
-static inline void io_uring_files_cancel(struct files_struct *files)
-{
-        if (current->io_uring && !xa_empty(&current->io_uring->xa))
-                __io_uring_files_cancel(files);
+        if (current->io_uring)
+                __io_uring_cancel(true);
 }
 static inline void io_uring_free(struct task_struct *tsk)
 {
@@ -63,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file)
 static inline void io_uring_task_cancel(void)
 {
 }
-static inline void io_uring_files_cancel(struct files_struct *files)
+static inline void io_uring_files_cancel(void)
 {
 }
 static inline void io_uring_free(struct task_struct *tsk)


@@ -885,6 +885,9 @@ struct task_struct {
         /* CLONE_CHILD_CLEARTID: */
         int __user *clear_child_tid;

+        /* PF_IO_WORKER */
+        void *pf_io_worker;
+
         u64 utime;
         u64 stime;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME


@@ -341,7 +341,7 @@ asmlinkage long sys_io_uring_setup(u32 entries,
                                struct io_uring_params __user *p);
 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
                                u32 min_complete, u32 flags,
-                               const sigset_t __user *sig, size_t sigsz);
+                               const void __user *argp, size_t argsz);
 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
                                void __user *arg, unsigned int nr_args);



@@ -12,11 +12,11 @@ struct io_wq_work;
 /**
  * io_uring_create - called after a new io_uring context was prepared
  *
  * @fd: corresponding file descriptor
  * @ctx: pointer to a ring context structure
  * @sq_entries: actual SQ size
  * @cq_entries: actual CQ size
  * @flags: SQ ring flags, provided to io_uring_setup(2)
  *
  * Allows to trace io_uring creation and provide pointer to a context, that can
  * be used later to find correlated events.
@@ -49,15 +49,15 @@ TRACE_EVENT(io_uring_create,
 );

 /**
- * io_uring_register - called after a buffer/file/eventfd was succesfully
+ * io_uring_register - called after a buffer/file/eventfd was successfully
  *                     registered for a ring
  *
  * @ctx: pointer to a ring context structure
  * @opcode: describes which operation to perform
  * @nr_user_files: number of registered files
  * @nr_user_bufs: number of registered buffers
  * @cq_ev_fd: whether eventfs registered or not
  * @ret: return code
  *
  * Allows to trace fixed files/buffers/eventfds, that could be registered to
  * avoid an overhead of getting references to them for every operation. This
@@ -142,16 +142,16 @@ TRACE_EVENT(io_uring_queue_async_work,
         TP_ARGS(ctx, rw, req, work, flags),

         TP_STRUCT__entry (
                 __field( void *, ctx )
                 __field( int, rw )
                 __field( void *, req )
                 __field( struct io_wq_work *, work )
                 __field( unsigned int, flags )
         ),

         TP_fast_assign(
                 __entry->ctx = ctx;
                 __entry->rw = rw;
                 __entry->req = req;
                 __entry->work = work;
                 __entry->flags = flags;
@@ -196,10 +196,10 @@ TRACE_EVENT(io_uring_defer,
 /**
  * io_uring_link - called before the io_uring request added into link_list of
  *                 another request
  *
  * @ctx: pointer to a ring context structure
  * @req: pointer to a linked request
  * @target_req: pointer to a previous request, that would contain @req
  *
  * Allows to track linked requests, to understand dependencies between requests
@@ -212,8 +212,8 @@ TRACE_EVENT(io_uring_link,
         TP_ARGS(ctx, req, target_req),

         TP_STRUCT__entry (
                 __field( void *, ctx )
                 __field( void *, req )
                 __field( void *, target_req )
         ),
@@ -244,7 +244,7 @@ TRACE_EVENT(io_uring_cqring_wait,
         TP_ARGS(ctx, min_events),

         TP_STRUCT__entry (
                 __field( void *, ctx )
                 __field( int, min_events )
         ),
@@ -272,7 +272,7 @@ TRACE_EVENT(io_uring_fail_link,
         TP_ARGS(req, link),

         TP_STRUCT__entry (
                 __field( void *, req )
                 __field( void *, link )
         ),
@@ -290,38 +290,42 @@ TRACE_EVENT(io_uring_fail_link,
  * @ctx: pointer to a ring context structure
  * @user_data: user data associated with the request
  * @res: result of the request
+ * @cflags: completion flags
  *
  */
 TRACE_EVENT(io_uring_complete,

-        TP_PROTO(void *ctx, u64 user_data, long res),
+        TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),

-        TP_ARGS(ctx, user_data, res),
+        TP_ARGS(ctx, user_data, res, cflags),

         TP_STRUCT__entry (
                 __field( void *, ctx )
                 __field( u64, user_data )
-                __field( long, res )
+                __field( int, res )
+                __field( unsigned, cflags )
         ),

         TP_fast_assign(
                 __entry->ctx = ctx;
                 __entry->user_data = user_data;
                 __entry->res = res;
+                __entry->cflags = cflags;
         ),

-        TP_printk("ring %p, user_data 0x%llx, result %ld",
+        TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
                   __entry->ctx, (unsigned long long)__entry->user_data,
-                  __entry->res)
+                  __entry->res, __entry->cflags)
 );

 /**
  * io_uring_submit_sqe - called before submitting one SQE
  *
  * @ctx: pointer to a ring context structure
+ * @req: pointer to a submitted request
  * @opcode: opcode of request
  * @user_data: user data associated with the request
+ * @flags request flags
  * @force_nonblock: whether a context blocking or not
  * @sq_thread: true if sq_thread has submitted this SQE
  *
@@ -330,41 +334,60 @@ TRACE_EVENT(io_uring_complete,
  */
 TRACE_EVENT(io_uring_submit_sqe,

-        TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
-                 bool sq_thread),
+        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data, u32 flags,
+                 bool force_nonblock, bool sq_thread),

-        TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
+        TP_ARGS(ctx, req, opcode, user_data, flags, force_nonblock, sq_thread),

         TP_STRUCT__entry (
                 __field( void *, ctx )
+                __field( void *, req )
                 __field( u8, opcode )
                 __field( u64, user_data )
+                __field( u32, flags )
                 __field( bool, force_nonblock )
                 __field( bool, sq_thread )
         ),

         TP_fast_assign(
                 __entry->ctx = ctx;
+                __entry->req = req;
                 __entry->opcode = opcode;
                 __entry->user_data = user_data;
+                __entry->flags = flags;
                 __entry->force_nonblock = force_nonblock;
                 __entry->sq_thread = sq_thread;
         ),

-        TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
-                  __entry->ctx, __entry->opcode,
-                  (unsigned long long) __entry->user_data,
-                  __entry->force_nonblock, __entry->sq_thread)
+        TP_printk("ring %p, req %p, op %d, data 0x%llx, flags %u, "
+                  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
+                  __entry->opcode, (unsigned long long)__entry->user_data,
+                  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
 );

+/*
+ * io_uring_poll_arm - called after arming a poll wait if successful
+ *
+ * @ctx: pointer to a ring context structure
+ * @req: pointer to the armed request
+ * @opcode: opcode of request
+ * @user_data: user data associated with the request
+ * @mask: request poll events mask
+ * @events: registered events of interest
+ *
+ * Allows to track which fds are waiting for and what are the events of
+ * interest.
+ */
 TRACE_EVENT(io_uring_poll_arm,

-        TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events),
+        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data,
+                 int mask, int events),

-        TP_ARGS(ctx, opcode, user_data, mask, events),
+        TP_ARGS(ctx, req, opcode, user_data, mask, events),

         TP_STRUCT__entry (
                 __field( void *, ctx )
+                __field( void *, req )
                 __field( u8, opcode )
                 __field( u64, user_data )
                 __field( int, mask )
@@ -373,16 +396,17 @@ TRACE_EVENT(io_uring_poll_arm,
         TP_fast_assign(
                 __entry->ctx = ctx;
+                __entry->req = req;
                 __entry->opcode = opcode;
                 __entry->user_data = user_data;
                 __entry->mask = mask;
                 __entry->events = events;
         ),

-        TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
-                  __entry->ctx, __entry->opcode,
+        TP_printk("ring %p, req %p, op %d, data 0x%llx, mask 0x%x, events 0x%x",
+                  __entry->ctx, __entry->req, __entry->opcode,
                   (unsigned long long) __entry->user_data,
                   __entry->mask, __entry->events)
 );

 TRACE_EVENT(io_uring_poll_wake,
@@ -437,27 +461,40 @@ TRACE_EVENT(io_uring_task_add,
                   __entry->mask)
 );

+/*
+ * io_uring_task_run - called when task_work_run() executes the poll events
+ *                     notification callbacks
+ *
+ * @ctx: pointer to a ring context structure
+ * @req: pointer to the armed request
+ * @opcode: opcode of request
+ * @user_data: user data associated with the request
+ *
+ * Allows to track when notified poll events are processed
+ */
 TRACE_EVENT(io_uring_task_run,

-        TP_PROTO(void *ctx, u8 opcode, u64 user_data),
+        TP_PROTO(void *ctx, void *req, u8 opcode, u64 user_data),

-        TP_ARGS(ctx, opcode, user_data),
+        TP_ARGS(ctx, req, opcode, user_data),

         TP_STRUCT__entry (
                 __field( void *, ctx )
+                __field( void *, req )
                 __field( u8, opcode )
                 __field( u64, user_data )
         ),

         TP_fast_assign(
                 __entry->ctx = ctx;
+                __entry->req = req;
                 __entry->opcode = opcode;
                 __entry->user_data = user_data;
         ),

-        TP_printk("ring %p, op %d, data 0x%llx",
-                  __entry->ctx, __entry->opcode,
+        TP_printk("ring %p, req %p, op %d, data 0x%llx",
+                  __entry->ctx, __entry->req, __entry->opcode,
                   (unsigned long long) __entry->user_data)
 );

 #endif /* _TRACE_IO_URING_H */


@@ -42,23 +42,25 @@ struct io_uring_sqe {
                 __u32 statx_flags;
                 __u32 fadvise_advice;
                 __u32 splice_flags;
+                __u32 rename_flags;
+                __u32 unlink_flags;
+                __u32 hardlink_flags;
         };
         __u64 user_data;        /* data to be passed back at completion time */
+        /* pack this to avoid bogus arm OABI complaints */
         union {
-                struct {
-                        /* pack this to avoid bogus arm OABI complaints */
-                        union {
-                                /* index into fixed buffers, if used */
-                                __u16 buf_index;
-                                /* for grouped buffer selection */
-                                __u16 buf_group;
-                        } __attribute__((packed));
-                        /* personality to use, if used */
-                        __u16 personality;
-                        __s32 splice_fd_in;
-                };
-                __u64 __pad2[3];
+                /* index into fixed buffers, if used */
+                __u16 buf_index;
+                /* for grouped buffer selection */
+                __u16 buf_group;
+        } __attribute__((packed));
+        /* personality to use, if used */
+        __u16 personality;
+        union {
+                __s32 splice_fd_in;
+                __u32 file_index;
         };
+        __u64 __pad2[2];
 };

 enum {
@@ -132,6 +134,9 @@ enum {
         IORING_OP_PROVIDE_BUFFERS,
         IORING_OP_REMOVE_BUFFERS,
         IORING_OP_TEE,
+        IORING_OP_SHUTDOWN,
+        IORING_OP_RENAMEAT,
+        IORING_OP_UNLINKAT,

         /* this goes last, obviously */
         IORING_OP_LAST,
@@ -145,14 +150,34 @@
 /*
  * sqe->timeout_flags
  */
 #define IORING_TIMEOUT_ABS (1U << 0)
+#define IORING_TIMEOUT_UPDATE (1U << 1)
+#define IORING_TIMEOUT_BOOTTIME (1U << 2)
+#define IORING_TIMEOUT_REALTIME (1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
+#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)

 /*
  * sqe->splice_flags
  * extends splice(2) flags
  */
 #define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */

+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI        Multishot poll. Sets IORING_CQE_F_MORE if
+ *                              the poll handler will continue to report
+ *                              CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE           Update existing poll request, matching
+ *                              sqe->addr as the old user_data field.
+ */
+#define IORING_POLL_ADD_MULTI (1U << 0)
+#define IORING_POLL_UPDATE_EVENTS (1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA (1U << 2)

 /*
  * IO completion data structure (Completion Queue Entry)
  */
@@ -166,8 +191,10 @@ struct io_uring_cqe {
  * cqe->flags
  *
  * IORING_CQE_F_BUFFER  If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE    If set, parent SQE will generate more CQE entries
  */
 #define IORING_CQE_F_BUFFER (1U << 0)
+#define IORING_CQE_F_MORE (1U << 1)

 enum {
         IORING_CQE_BUFFER_SHIFT = 16,
@@ -226,6 +253,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_GETEVENTS (1U << 0)
 #define IORING_ENTER_SQ_WAKEUP (1U << 1)
 #define IORING_ENTER_SQ_WAIT (1U << 2)
+#define IORING_ENTER_EXT_ARG (1U << 3)

 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -253,6 +281,10 @@ struct io_uring_params {
 #define IORING_FEAT_CUR_PERSONALITY (1U << 4)
 #define IORING_FEAT_FAST_POLL (1U << 5)
 #define IORING_FEAT_POLL_32BITS (1U << 6)
+#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
+#define IORING_FEAT_EXT_ARG (1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
+#define IORING_FEAT_RSRC_TAGS (1U << 10)

 /*
  * io_uring_register(2) opcodes and arguments
@@ -272,16 +304,62 @@ enum {
         IORING_REGISTER_RESTRICTIONS = 11,
         IORING_REGISTER_ENABLE_RINGS = 12,

+        /* extended with tagging */
+        IORING_REGISTER_FILES2 = 13,
+        IORING_REGISTER_FILES_UPDATE2 = 14,
+        IORING_REGISTER_BUFFERS2 = 15,
+        IORING_REGISTER_BUFFERS_UPDATE = 16,
+
+        /* set/clear io-wq thread affinities */
+        IORING_REGISTER_IOWQ_AFF = 17,
+        IORING_UNREGISTER_IOWQ_AFF = 18,
+
+        /* set/get max number of io-wq workers */
+        IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
+
         /* this goes last */
         IORING_REGISTER_LAST
 };

+/* io-wq worker categories */
+enum {
+        IO_WQ_BOUND,
+        IO_WQ_UNBOUND,
+};
+
+/* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
         __u32 offset;
         __u32 resv;
         __aligned_u64 /* __s32 * */ fds;
 };

+struct io_uring_rsrc_register {
+        __u32 nr;
+        __u32 resv;
+        __u64 resv2;
+        __aligned_u64 data;
+        __aligned_u64 tags;
+};
+
+struct io_uring_rsrc_update {
+        __u32 offset;
+        __u32 resv;
+        __aligned_u64 data;
+};
+
+struct io_uring_rsrc_update2 {
+        __u32 offset;
+        __u32 resv;
+        __aligned_u64 data;
+        __aligned_u64 tags;
+        __u32 nr;
+        __u32 resv2;
+};
+
+/* Skip updating fd indexes set to this value in the fd table */
+#define IORING_REGISTER_FILES_SKIP (-2)
+
 #define IO_URING_OP_SUPPORTED (1U << 0)

 struct io_uring_probe_op {
@@ -329,4 +407,11 @@ enum {
         IORING_RESTRICTION_LAST
 };

+struct io_uring_getevents_arg {
+        __u64 sigmask;
+        __u32 sigmask_sz;
+        __u32 pad;
+        __u64 ts;
+};
+
 #endif
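For context on the IORING_ENTER_EXT_ARG and struct io_uring_getevents_arg
additions above: with the extended-argument flag set, the last two parameters
of io_uring_enter(2) carry a pointer to this struct and its size instead of a
raw sigset_t. A minimal userspace sketch follows (raw syscall, no liburing; the
helper name and the assumption that the libc headers expose __NR_io_uring_enter
are illustrative, not part of this patch):

#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wait for at least min_complete completions, with an optional timeout
 * passed through struct io_uring_getevents_arg (IORING_ENTER_EXT_ARG). */
static int wait_cqes_with_timeout(int ring_fd, unsigned int min_complete,
                                  struct __kernel_timespec *ts)
{
        struct io_uring_getevents_arg arg;

        memset(&arg, 0, sizeof(arg));           /* sigmask == 0: no mask change */
        arg.ts = (uint64_t)(uintptr_t)ts;       /* may stay 0 for "no timeout" */

        return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                       &arg, sizeof(arg));
}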

io_uring/Makefile (new file, 6 lines)

@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for io_uring
+
+obj-$(CONFIG_IO_URING) += io_uring.o
+obj-$(CONFIG_IO_WQ) += io-wq.o

io_uring/io-wq.c (new file, 1398 lines): file diff suppressed because it is too large.


@@ -1,7 +1,7 @@
 #ifndef INTERNAL_IO_WQ_H
 #define INTERNAL_IO_WQ_H

-#include <linux/io_uring.h>
+#include <linux/refcount.h>

 struct io_wq;
@@ -9,16 +9,8 @@ enum {
         IO_WQ_WORK_CANCEL = 1,
         IO_WQ_WORK_HASHED = 2,
         IO_WQ_WORK_UNBOUND = 4,
-        IO_WQ_WORK_NO_CANCEL = 8,
         IO_WQ_WORK_CONCURRENT = 16,
-        IO_WQ_WORK_FILES = 32,
-        IO_WQ_WORK_FS = 64,
-        IO_WQ_WORK_MM = 128,
-        IO_WQ_WORK_CREDS = 256,
-        IO_WQ_WORK_BLKCG = 512,
-        IO_WQ_WORK_FSIZE = 1024,

         IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
 };
@@ -52,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
                                     struct io_wq_work_list *list)
 {
+        node->next = NULL;
         if (!list->first) {
                 list->last = node;
                 WRITE_ONCE(list->first, node);
@@ -59,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
                 list->last->next = node;
                 list->last = node;
         }
-        node->next = NULL;
 }

 static inline void wq_list_cut(struct io_wq_work_list *list,
@@ -95,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 struct io_wq_work {
         struct io_wq_work_node list;
-        struct io_identity *identity;
         unsigned flags;
 };
@@ -107,37 +98,48 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
         return container_of(work->list.next, struct io_wq_work, list);
 }

-typedef void (free_work_fn)(struct io_wq_work *);
-typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work *);

+struct io_wq_hash {
+        refcount_t refs;
+        unsigned long map;
+        struct wait_queue_head wait;
+};
+
+static inline void io_wq_put_hash(struct io_wq_hash *hash)
+{
+        if (refcount_dec_and_test(&hash->refs))
+                kfree(hash);
+}
+
 struct io_wq_data {
-        struct user_struct *user;
+        struct io_wq_hash *hash;
+        struct task_struct *task;
         io_wq_work_fn *do_work;
         free_work_fn *free_work;
 };

 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
-void io_wq_destroy(struct io_wq *wq);
+void io_wq_exit_start(struct io_wq *wq);
+void io_wq_put_and_exit(struct io_wq *wq);

 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
 void io_wq_hash_work(struct io_wq_work *work, void *val);

+int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+int io_wq_max_workers(struct io_wq *wq, int *new_count);
+
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
         return work->flags & IO_WQ_WORK_HASHED;
 }

-void io_wq_cancel_all(struct io_wq *wq);
-
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                         void *data, bool cancel_all);

-struct task_struct *io_wq_get_task(struct io_wq *wq);
-
 #if defined(CONFIG_IO_WQ)
 extern void io_wq_worker_sleeping(struct task_struct *);
 extern void io_wq_worker_running(struct task_struct *);
@@ -152,6 +154,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
 static inline bool io_wq_current_is_worker(void)
 {
-        return in_task() && (current->flags & PF_IO_WORKER);
+        return in_task() && (current->flags & PF_IO_WORKER) &&
+                current->pf_io_worker;
 }
 #endif

File diff suppressed because it is too large.


@@ -763,7 +763,7 @@ void __noreturn do_exit(long code)
                 schedule();
         }

-        io_uring_files_cancel(tsk->files);
+        io_uring_files_cancel();
         exit_signals(tsk); /* sets PF_EXITING */

         /* sync mm's RSS info before statistics gathering */


@@ -926,6 +926,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         tsk->splice_pipe = NULL;
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
+        tsk->pf_io_worker = NULL;

         account_kernel_stack(tsk, 1);


@@ -21,7 +21,7 @@
 #include <asm/tlb.h>

 #include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
+#include "../../io_uring/io-wq.h"
 #include "../smpboot.h"

 #include "pelt.h"