#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
struct fixed_rsrc_ref_node *node;
struct percpu_ref refs;
struct completion done;
+ bool quiesce;
};
struct io_buffer {
bool registered;
};
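+/* lifecycle bits in io_sq_data->state used to stop/park the SQPOLL thread */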
+enum {
+ IO_SQ_THREAD_SHOULD_STOP = 0,
+ IO_SQ_THREAD_SHOULD_PARK,
+};
+
struct io_sq_data {
refcount_t refs;
struct mutex lock;
struct wait_queue_head wait;
unsigned sq_thread_idle;
+ int sq_cpu;
+ pid_t task_pid;
+
+ unsigned long state;
+ struct completion startup;
+ struct completion completion;
+ struct completion exited;
};
#define IO_IOPOLL_BATCH 8
struct list_head locked_free_list;
};
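+/* tracks the head and tail of the SQE link chain being built during submission */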
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
struct io_submit_state {
struct blk_plug plug;
+ struct io_submit_link link;
/*
* io_kiocb alloc cache
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int sqo_dead: 1;
+ unsigned int sqo_exec: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
+ /* hashed buffered write serialization */
+ struct io_wq_hash *hash_map;
+
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
struct io_rings *rings;
- /* IO offload */
- struct io_wq *io_wq;
-
/*
- * For SQPOLL usage - we hold a reference to the parent task, so we
- * have access to the ->files
+ * For SQPOLL usage
*/
struct task_struct *sqo_task;
/* Only used for accounting purposes */
struct mm_struct *mm_account;
-#ifdef CONFIG_BLK_CGROUP
- struct cgroup_subsys_state *sqo_blkcg_css;
-#endif
-
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct user_struct *user;
- const struct cred *creds;
-
-#ifdef CONFIG_AUDIT
- kuid_t loginuid;
- unsigned int sessionid;
-#endif
-
struct completion ref_comp;
struct completion sq_thread_comp;
struct io_restriction restrictions;
+ /* exit task_work */
+ struct callback_head *exit_task_work;
+
+ struct wait_queue_head hash_wait;
+
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
};
unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
- unsigned work_flags;
};
static const struct io_op_def io_op_defs[] = {
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
.needs_async_data = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.pollin = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
- IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
- .work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
- },
- [IORING_OP_OPENAT] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS | IO_WQ_WORK_MM,
- },
- [IORING_OP_CLOSE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_FILES_UPDATE] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
- },
- [IORING_OP_STATX] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_OPENAT] = {},
+ [IORING_OP_CLOSE] = {},
+ [IORING_OP_FILES_UPDATE] = {},
+ [IORING_OP_STATX] = {},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.buffer_select = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
.pollout = 1,
.plug = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_MADVISE] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
+ [IORING_OP_MADVISE] = {},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
- .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
},
- [IORING_OP_RENAMEAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
- [IORING_OP_UNLINKAT] = {
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
- IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
- },
+ [IORING_OP_RENAMEAT] = {},
+ [IORING_OP_UNLINKAT] = {},
};
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files);
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
struct io_ring_ctx *ctx);
-static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *ref_node);
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static bool io_rw_reissue(struct io_kiocb *req);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
continue;
if (req->file && req->file->f_op == &io_uring_fops)
return true;
- if ((req->work.flags & IO_WQ_WORK_FILES) &&
- req->work.identity->files == files)
+ if (req->task->files == files)
return true;
}
return false;
}
-static void io_sq_thread_drop_mm_files(void)
-{
- struct files_struct *files = current->files;
- struct mm_struct *mm = current->mm;
-
- if (mm) {
- kthread_unuse_mm(mm);
- mmput(mm);
- current->mm = NULL;
- }
- if (files) {
- struct nsproxy *nsproxy = current->nsproxy;
-
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
- put_files_struct(files);
- put_nsproxy(nsproxy);
- }
-}
-
-static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
-{
- if (!current->files) {
- struct files_struct *files;
- struct nsproxy *nsproxy;
-
- task_lock(ctx->sqo_task);
- files = ctx->sqo_task->files;
- if (!files) {
- task_unlock(ctx->sqo_task);
- return -EOWNERDEAD;
- }
- atomic_inc(&files->count);
- get_nsproxy(ctx->sqo_task->nsproxy);
- nsproxy = ctx->sqo_task->nsproxy;
- task_unlock(ctx->sqo_task);
-
- task_lock(current);
- current->files = files;
- current->nsproxy = nsproxy;
- task_unlock(current);
- }
- return 0;
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
- struct mm_struct *mm;
-
- if (current->mm)
- return 0;
-
- task_lock(ctx->sqo_task);
- mm = ctx->sqo_task->mm;
- if (unlikely(!mm || !mmget_not_zero(mm)))
- mm = NULL;
- task_unlock(ctx->sqo_task);
-
- if (mm) {
- kthread_use_mm(mm);
- return 0;
- }
-
- return -EFAULT;
-}
-
-static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- int ret;
-
- if (def->work_flags & IO_WQ_WORK_MM) {
- ret = __io_sq_thread_acquire_mm(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
- ret = __io_sq_thread_acquire_files(ctx);
- if (unlikely(ret))
- return ret;
- }
-
- return 0;
-}
-
-static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- if (!(ctx->flags & IORING_SETUP_SQPOLL))
- return 0;
- return __io_sq_thread_acquire_mm_files(ctx, req);
-}
-
-static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
- struct cgroup_subsys_state **cur_css)
-
-{
-#ifdef CONFIG_BLK_CGROUP
- /* puts the old one when swapping */
- if (*cur_css != ctx->sqo_blkcg_css) {
- kthread_associate_blkcg(ctx->sqo_blkcg_css);
- *cur_css = ctx->sqo_blkcg_css;
- }
-#endif
-}
-
-static void io_sq_thread_unassociate_blkcg(void)
-{
-#ifdef CONFIG_BLK_CGROUP
- kthread_associate_blkcg(NULL);
-#endif
-}
-
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
- id->files = current->files;
- id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- rcu_read_lock();
- id->blkcg_css = blkcg_css();
- rcu_read_unlock();
-#endif
- id->creds = current_cred();
- id->nsproxy = current->nsproxy;
- id->fs = current->fs;
- id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
- id->loginuid = current->loginuid;
- id->sessionid = current->sessionid;
-#endif
- refcount_set(&id->count, 1);
-}
-
static inline void __io_req_init_async(struct io_kiocb *req)
{
memset(&req->work, 0, sizeof(req->work));
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
- struct io_uring_task *tctx = current->io_uring;
-
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
__io_req_init_async(req);
-
- /* Grab a ref if this isn't our static identity */
- req->work.identity = tctx->identity;
- if (tctx->identity != &tctx->__identity)
- refcount_inc(&req->work.identity->count);
}
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
return false;
}
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
- if (req->work.identity == &tctx->__identity)
- return;
- if (refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-}
-
static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
- if (req->work.flags & IO_WQ_WORK_MM)
- mmdrop(req->work.identity->mm);
-#ifdef CONFIG_BLK_CGROUP
- if (req->work.flags & IO_WQ_WORK_BLKCG)
- css_put(req->work.identity->blkcg_css);
-#endif
- if (req->work.flags & IO_WQ_WORK_CREDS)
- put_cred(req->work.identity->creds);
- if (req->work.flags & IO_WQ_WORK_FS) {
- struct fs_struct *fs = req->work.identity->fs;
-
- spin_lock(&req->work.identity->fs->lock);
- if (--fs->users)
- fs = NULL;
- spin_unlock(&req->work.identity->fs->lock);
- if (fs)
- free_fs_struct(fs);
- }
- if (req->work.flags & IO_WQ_WORK_FILES) {
- put_files_struct(req->work.identity->files);
- put_nsproxy(req->work.identity->nsproxy);
+ if (req->work.creds) {
+ put_cred(req->work.creds);
+ req->work.creds = NULL;
}
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
}
req->flags &= ~REQ_F_WORK_INITIALIZED;
- req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
- IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
- io_put_identity(req->task->io_uring, req);
-}
-
-/*
- * Create a private copy of io_identity, since some fields don't match
- * the current context.
- */
-static bool io_identity_cow(struct io_kiocb *req)
-{
- struct io_uring_task *tctx = current->io_uring;
- const struct cred *creds = NULL;
- struct io_identity *id;
-
- if (req->work.flags & IO_WQ_WORK_CREDS)
- creds = req->work.identity->creds;
-
- id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
- if (unlikely(!id)) {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- return false;
- }
-
- /*
- * We can safely just re-init the creds we copied Either the field
- * matches the current one, or we haven't grabbed it yet. The only
- * exception is ->creds, through registered personalities, so handle
- * that one separately.
- */
- io_init_identity(id);
- if (creds)
- id->creds = creds;
-
- /* add one for this request */
- refcount_inc(&id->count);
-
- /* drop tctx and req identity references, if needed */
- if (tctx->identity != &tctx->__identity &&
- refcount_dec_and_test(&tctx->identity->count))
- kfree(tctx->identity);
- if (req->work.identity != &tctx->__identity &&
- refcount_dec_and_test(&req->work.identity->count))
- kfree(req->work.identity);
-
- req->work.identity = id;
- tctx->identity = id;
- return true;
}
static void io_req_track_inflight(struct io_kiocb *req)
}
}
-static bool io_grab_identity(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_identity *id = req->work.identity;
-
- if (def->work_flags & IO_WQ_WORK_FSIZE) {
- if (id->fsize != rlimit(RLIMIT_FSIZE))
- return false;
- req->work.flags |= IO_WQ_WORK_FSIZE;
- }
-#ifdef CONFIG_BLK_CGROUP
- if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
- (def->work_flags & IO_WQ_WORK_BLKCG)) {
- rcu_read_lock();
- if (id->blkcg_css != blkcg_css()) {
- rcu_read_unlock();
- return false;
- }
- /*
- * This should be rare, either the cgroup is dying or the task
- * is moving cgroups. Just punt to root for the handful of ios.
- */
- if (css_tryget_online(id->blkcg_css))
- req->work.flags |= IO_WQ_WORK_BLKCG;
- rcu_read_unlock();
- }
-#endif
- if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
- if (id->creds != current_cred())
- return false;
- get_cred(id->creds);
- req->work.flags |= IO_WQ_WORK_CREDS;
- }
-#ifdef CONFIG_AUDIT
- if (!uid_eq(current->loginuid, id->loginuid) ||
- current->sessionid != id->sessionid)
- return false;
-#endif
- if (!(req->work.flags & IO_WQ_WORK_FS) &&
- (def->work_flags & IO_WQ_WORK_FS)) {
- if (current->fs != id->fs)
- return false;
- spin_lock(&id->fs->lock);
- if (!id->fs->in_exec) {
- id->fs->users++;
- req->work.flags |= IO_WQ_WORK_FS;
- } else {
- req->work.flags |= IO_WQ_WORK_CANCEL;
- }
- spin_unlock(&current->fs->lock);
- }
- if (!(req->work.flags & IO_WQ_WORK_FILES) &&
- (def->work_flags & IO_WQ_WORK_FILES) &&
- !(req->flags & REQ_F_NO_FILE_TABLE)) {
- if (id->files != current->files ||
- id->nsproxy != current->nsproxy)
- return false;
- atomic_inc(&id->files->count);
- get_nsproxy(id->nsproxy);
- req->work.flags |= IO_WQ_WORK_FILES;
- io_req_track_inflight(req);
- }
- if (!(req->work.flags & IO_WQ_WORK_MM) &&
- (def->work_flags & IO_WQ_WORK_MM)) {
- if (id->mm != current->mm)
- return false;
- mmgrab(id->mm);
- req->work.flags |= IO_WQ_WORK_MM;
- }
-
- return true;
-}
-
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
-
- /* if we fail grabbing identity, we must COW, regrab, and retry */
- if (io_grab_identity(req))
- return;
-
- if (!io_identity_cow(req))
- return;
-
- /* can't fail at this point */
- if (!io_grab_identity(req))
- WARN_ON(1);
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
}
static void io_prep_async_link(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ BUG_ON(!tctx);
+ BUG_ON(!tctx->io_wq);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
- io_wq_enqueue(ctx->io_wq, &req->work);
+ io_wq_enqueue(tctx->io_wq, &req->work);
return link;
}
static void io_req_task_work_add_fallback(struct io_kiocb *req,
task_work_func_t cb)
{
- struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct callback_head *head;
init_task_work(&req->task_work, cb);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
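+ /* fall back to a lockless push onto ctx->exit_task_work, run at ring exit */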
+ do {
+ head = READ_ONCE(ctx->exit_task_work);
+ req->task_work.next = head;
+ } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
}
static void __io_req_task_cancel(struct io_kiocb *req, int error)
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
- __io_req_task_cancel(req, -ECANCELED);
+ mutex_lock(&ctx->uring_lock);
+ __io_req_task_cancel(req, req->result);
+ mutex_unlock(&ctx->uring_lock);
percpu_ref_put(&ctx->refs);
}
/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
mutex_lock(&ctx->uring_lock);
- if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
- !io_sq_thread_acquire_mm_files(ctx, req))
+ if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
__io_queue_sqe(req);
else
__io_req_task_cancel(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
-
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_sq_thread_drop_mm_files();
}
static void io_req_task_submit(struct callback_head *cb)
req->task_work.func = io_req_task_submit;
ret = io_req_task_work_add(req);
if (unlikely(ret)) {
+ req->result = -ECANCELED;
percpu_ref_get(&req->ctx->refs);
io_req_task_work_add_fallback(req, io_req_task_cancel);
}
}
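+/* fail a request via task_work so its completion happens from a safe context */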
+static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
+{
+ percpu_ref_get(&req->ctx->refs);
+ req->result = ret;
+ req->task_work.func = io_req_task_cancel;
+
+ if (unlikely(io_req_task_work_add(req)))
+ io_req_task_work_add_fallback(req, io_req_task_cancel);
+}
+
static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
{
#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
- int ret;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
return false;
+ /*
+ * If ref is dying, we might be running poll reap from the exit work.
+ * Don't attempt to reissue from that path, just let it fail with
+ * -EAGAIN.
+ */
+ if (percpu_ref_is_dying(&req->ctx->refs))
+ return false;
lockdep_assert_held(&req->ctx->uring_lock);
- ret = io_sq_thread_acquire_mm_files(req->ctx, req);
-
- if (!ret && io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req)) {
refcount_inc(&req->refs);
io_queue_async_work(req);
return true;
static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret;
-
- ret = io_prep_rw(req, sqe);
- if (ret)
- return ret;
-
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
-
- /* either don't need iovec imported or already have it */
- if (!req->async_data)
- return 0;
- return io_rw_prep_async(req, READ);
+ return io_prep_rw(req, sqe);
}
/*
ret = io_iter_do_read(req, iter);
if (ret == -EIOCBQUEUED) {
- /* it's faster to check here then delegate to kfree */
- if (iovec)
- kfree(iovec);
- return 0;
+ goto out_free;
} else if (ret == -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
if (ret2)
return ret2;
+ iovec = NULL;
rw = req->async_data;
/* now use our persistent iterator, if we aren't already */
iter = &rw->iter;
} while (ret > 0 && ret < io_size);
done:
kiocb_done(kiocb, ret, issue_flags);
+out_free:
+ /* it's faster to check here than delegate to kfree */
+ if (iovec)
+ kfree(iovec);
return 0;
}
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- ssize_t ret;
-
- ret = io_prep_rw(req, sqe);
- if (ret)
- return ret;
-
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
-
- /* either don't need iovec imported or already have it */
- if (!req->async_data)
- return 0;
- return io_rw_prep_async(req, WRITE);
+ return io_prep_rw(req, sqe);
}
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
-static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
return 0;
}
-static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->file)
- return -EBADF;
-
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
req->sr_msg.msg_flags, &iomsg->free_iov);
}
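+/* deferred prep: copy the user msghdr into the request's async data */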
+static int io_sendmsg_prep_async(struct io_kiocb *req)
+{
+ int ret;
+
+ if (!io_op_defs[req->opcode].needs_async_data)
+ return 0;
+ ret = io_sendmsg_copy_hdr(req, req->async_data);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
+}
+
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_async_msghdr *async_msg = req->async_data;
struct io_sr_msg *sr = &req->sr_msg;
- int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
-
- if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
- return 0;
- ret = io_sendmsg_copy_hdr(req, async_msg);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
+ return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_put_kbuf(req, req->sr_msg.kbuf);
}
-static int io_recvmsg_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_recvmsg_prep_async(struct io_kiocb *req)
{
- struct io_async_msghdr *async_msg = req->async_data;
- struct io_sr_msg *sr = &req->sr_msg;
int ret;
+ if (!io_op_defs[req->opcode].needs_async_data)
+ return 0;
+ ret = io_recvmsg_copy_hdr(req, req->async_data);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
-
- if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
- return 0;
- ret = io_recvmsg_copy_hdr(req, async_msg);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
+ return 0;
}
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
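+/* deferred prep: copy the connect address into the request's async data */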
+static int io_connect_prep_async(struct io_kiocb *req)
+{
+ struct io_async_connect *io = req->async_data;
+ struct io_connect *conn = &req->connect;
+
+ return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
+}
+
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- struct io_async_connect *io = req->async_data;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
-
- if (!io)
- return 0;
-
- return move_addr_to_kernel(conn->addr, conn->addr_len,
- &io->address);
+ return 0;
}
static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
#else /* !CONFIG_NET */
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
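+/* generate -EOPNOTSUPP stubs for the networking opcodes when CONFIG_NET is not set */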
+#define IO_NETOP_FN(op) \
+static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
+{ \
+ return -EOPNOTSUPP; \
+}
+
+#define IO_NETOP_PREP(op) \
+IO_NETOP_FN(op) \
+static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
+{ \
+ return -EOPNOTSUPP; \
+} \
+
+#define IO_NETOP_PREP_ASYNC(op) \
+IO_NETOP_PREP(op) \
+static int io_##op##_prep_async(struct io_kiocb *req) \
+{ \
+ return -EOPNOTSUPP; \
+}
+
+IO_NETOP_PREP_ASYNC(sendmsg);
+IO_NETOP_PREP_ASYNC(recvmsg);
+IO_NETOP_PREP_ASYNC(connect);
+IO_NETOP_PREP(accept);
+IO_NETOP_FN(send);
+IO_NETOP_FN(recv);
+#endif /* CONFIG_NET */
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
-static int io_send(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
+ __poll_t mask, task_work_func_t func)
{
- return -EOPNOTSUPP;
-}
+ int ret;
-static int io_recvmsg_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- return -EOPNOTSUPP;
-}
-
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
-{
- return -EOPNOTSUPP;
-}
-#endif /* CONFIG_NET */
-
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int error;
-};
-
-static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
- __poll_t mask, task_work_func_t func)
-{
- int ret;
-
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
-
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+ trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
list_del_init(&poll->wait.entry);
return req->user_data == (unsigned long) data;
}
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
{
enum io_wq_cancel cancel_ret;
int ret = 0;
- cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+ if (!tctx->io_wq)
+ return -ENOENT;
+
+ cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
unsigned long flags;
int ret;
- ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+ ret = io_async_cancel_one(req->task->io_uring,
+ (void *) (unsigned long) sqe_addr);
if (ret != -ENOENT) {
spin_lock_irqsave(&ctx->completion_lock, flags);
goto done;
case IORING_OP_POLL_REMOVE:
return io_poll_remove_prep(req, sqe);
case IORING_OP_FSYNC:
- return io_prep_fsync(req, sqe);
+ return io_fsync_prep(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
- return io_prep_sfr(req, sqe);
+ return io_sfr_prep(req, sqe);
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
return io_sendmsg_prep(req, sqe);
return -EINVAL;
}
-static int io_req_defer_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
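+/* opcode-specific async prep for requests that get deferred or punted to io-wq */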
+static int io_req_prep_async(struct io_kiocb *req)
+{
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ return io_rw_prep_async(req, READ);
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ return io_rw_prep_async(req, WRITE);
+ case IORING_OP_SENDMSG:
+ case IORING_OP_SEND:
+ return io_sendmsg_prep_async(req);
+ case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
+ return io_recvmsg_prep_async(req);
+ case IORING_OP_CONNECT:
+ return io_connect_prep_async(req);
+ }
+ return 0;
+}
+
+static int io_req_defer_prep(struct io_kiocb *req)
{
- if (!sqe)
+ if (!io_op_defs[req->opcode].needs_async_data)
+ return 0;
+ /* some opcodes init it during the initial prep */
+ if (req->async_data)
return 0;
- if (io_alloc_async_data(req))
+ if (__io_alloc_async_data(req))
return -EAGAIN;
- return io_req_prep(req, sqe);
+ return io_req_prep_async(req);
}
static u32 io_get_sequence(struct io_kiocb *req)
return total_submitted - nr_reqs;
}
-static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
return 0;
- if (!req->async_data) {
- ret = io_req_defer_prep(req, sqe);
- if (ret)
- return ret;
- }
+ ret = io_req_defer_prep(req);
+ if (ret)
+ return ret;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de)
} while (1);
}
+ /* avoid locking problems by failing it from a clean context */
if (ret) {
- struct io_ring_ctx *lock_ctx = NULL;
-
- if (req->ctx->flags & IORING_SETUP_IOPOLL)
- lock_ctx = req->ctx;
-
- /*
- * io_iopoll_complete() does not hold completion_lock to
- * complete polled io, so here for polled io, we can not call
- * io_req_complete() directly, otherwise there maybe concurrent
- * access to cqring, defer_list, etc, which is not safe. Given
- * that io_iopoll_complete() is always called under uring_lock,
- * so here for polled io, we also get uring_lock to complete
- * it.
- */
- if (lock_ctx)
- mutex_lock(&lock_ctx->uring_lock);
-
- req_set_fail_links(req);
- io_req_complete(req, ret);
-
- if (lock_ctx)
- mutex_unlock(&lock_ctx->uring_lock);
+ /* io-wq is going to take one down */
+ refcount_inc(&req->refs);
+ io_req_task_queue_fail(req, ret);
}
}
const struct cred *old_creds = NULL;
int ret;
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_CREDS) &&
- req->work.identity->creds != current_cred())
- old_creds = override_creds(req->work.identity->creds);
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+ req->work.creds != current_cred())
+ old_creds = override_creds(req->work.creds);
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(req, sqe);
+ ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
io_req_complete(req, ret);
}
} else if (req->flags & REQ_F_FORCE_ASYNC) {
- if (!req->async_data) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
- }
+ ret = io_req_defer_prep(req);
+ if (unlikely(ret))
+ goto fail_req;
io_queue_async_work(req);
} else {
- if (sqe) {
- ret = io_req_prep(req, sqe);
- if (unlikely(ret))
- goto fail_req;
- }
__io_queue_sqe(req);
}
}
-static inline void io_queue_link_head(struct io_kiocb *req)
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+ struct io_kiocb *req,
+ unsigned int sqe_flags)
{
- if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
- io_put_req(req);
- io_req_complete(req, -ECANCELED);
- } else
- io_queue_sqe(req, NULL);
+ if (!ctx->restricted)
+ return true;
+
+ if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+ return false;
+
+ if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+ ctx->restrictions.sqe_flags_required)
+ return false;
+
+ if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+ ctx->restrictions.sqe_flags_required))
+ return false;
+
+ return true;
}
-struct io_submit_link {
- struct io_kiocb *head;
- struct io_kiocb *last;
-};
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_submit_state *state;
+ unsigned int sqe_flags;
+ int id, ret = 0;
+
+ req->opcode = READ_ONCE(sqe->opcode);
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags = sqe_flags = READ_ONCE(sqe->flags);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->async_data = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->link = NULL;
+ req->fixed_rsrc_refs = NULL;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = current;
+ req->result = 0;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
+ req->flags = 0;
+ return -EINVAL;
+ }
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+ return -EACCES;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ __io_req_init_async(req);
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
+
+ state = &ctx->submit_state;
+
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
-static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_submit_link *link)
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file))
+ ret = -EBADF;
+ }
+
+ state->ios_left--;
+ return ret;
+}
+
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
+ struct io_submit_link *link = &ctx->submit_state.link;
int ret;
+ ret = io_init_req(ctx, req, sqe);
+ if (unlikely(ret)) {
+fail_req:
+ io_put_req(req);
+ io_req_complete(req, ret);
+ if (link->head) {
+ /* fail even hard links since we don't submit */
+ link->head->flags |= REQ_F_FAIL_LINK;
+ io_put_req(link->head);
+ io_req_complete(link->head, -ECANCELED);
+ link->head = NULL;
+ }
+ return ret;
+ }
+ ret = io_req_prep(req, sqe);
+ if (unlikely(ret))
+ goto fail_req;
+
+ /* don't need @sqe from now on */
+ trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
+ true, ctx->flags & IORING_SETUP_SQPOLL);
+
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret)) {
- /* fail even hard links since we don't submit */
- head->flags |= REQ_F_FAIL_LINK;
- return ret;
- }
+ ret = io_req_defer_prep(req);
+ if (unlikely(ret))
+ goto fail_req;
trace_io_uring_link(ctx, req, head);
link->last->link = req;
link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_link_head(head);
+ io_queue_sqe(head);
link->head = NULL;
}
} else {
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- ret = io_req_defer_prep(req, sqe);
- if (unlikely(ret))
- req->flags |= REQ_F_FAIL_LINK;
link->head = req;
link->last = req;
} else {
- io_queue_sqe(req, sqe);
+ io_queue_sqe(req);
}
}
static void io_submit_state_end(struct io_submit_state *state,
struct io_ring_ctx *ctx)
{
+ if (state->link.head)
+ io_queue_sqe(state->link.head);
if (state->comp.nr)
io_submit_flush_completions(&state->comp, ctx);
if (state->plug_started)
{
state->plug_started = false;
state->ios_left = max_ios;
+ /* set only head, no need to init link_last in advance */
+ state->link.head = NULL;
}
static void io_commit_sqring(struct io_ring_ctx *ctx)
return NULL;
}
-/*
- * Check SQE restrictions (opcode and flags).
- *
- * Returns 'true' if SQE is allowed, 'false' otherwise.
- */
-static inline bool io_check_restriction(struct io_ring_ctx *ctx,
- struct io_kiocb *req,
- unsigned int sqe_flags)
-{
- if (!ctx->restricted)
- return true;
-
- if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
- return false;
-
- if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
- ctx->restrictions.sqe_flags_required)
- return false;
-
- if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
- ctx->restrictions.sqe_flags_required))
- return false;
-
- return true;
-}
-
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_submit_state *state;
- unsigned int sqe_flags;
- int id, ret = 0;
-
- req->opcode = READ_ONCE(sqe->opcode);
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->user_data = READ_ONCE(sqe->user_data);
- req->async_data = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->link = NULL;
- req->fixed_rsrc_refs = NULL;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->task = current;
- req->result = 0;
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
-
- if (unlikely(req->opcode >= IORING_OP_LAST))
- return -EINVAL;
-
- if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
- return -EFAULT;
-
- if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
- return -EACCES;
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select)
- return -EOPNOTSUPP;
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- struct io_identity *iod;
-
- iod = idr_find(&ctx->personality_idr, id);
- if (unlikely(!iod))
- return -EINVAL;
- refcount_inc(&iod->count);
-
- __io_req_init_async(req);
- get_cred(iod->creds);
- req->work.identity = iod;
- req->work.flags |= IO_WQ_WORK_CREDS;
- }
-
- state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than 1 IO left after this, and the target
- * is potentially a read/write to block based storage.
- */
- if (!state->plug_started && state->ios_left > 1 &&
- io_op_defs[req->opcode].plug) {
- blk_start_plug(&state->plug);
- state->plug_started = true;
- }
-
- if (io_op_defs[req->opcode].needs_file) {
- bool fixed = req->flags & REQ_F_FIXED_FILE;
-
- req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
- if (unlikely(!req->file))
- ret = -EBADF;
- }
-
- state->ios_left--;
- return ret;
-}
-
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
- struct io_submit_link link;
- int i, submitted = 0;
+ int submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
-
io_submit_state_start(&ctx->submit_state, nr);
- link.head = NULL;
- for (i = 0; i < nr; i++) {
+ while (submitted < nr) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- int err;
req = io_alloc_req(ctx);
if (unlikely(!req)) {
}
/* will complete beyond this point, count as submitted */
submitted++;
-
- err = io_init_req(ctx, req, sqe);
- if (unlikely(err)) {
-fail_req:
- io_put_req(req);
- io_req_complete(req, err);
+ if (io_submit_sqe(ctx, req, sqe))
break;
- }
-
- trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, ctx->flags & IORING_SETUP_SQPOLL);
- err = io_submit_sqe(req, sqe, &link);
- if (err)
- goto fail_req;
}
if (unlikely(submitted != nr)) {
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link.head)
- io_queue_link_head(link.head);
- io_submit_state_end(&ctx->submit_state, ctx);
+ io_submit_state_end(&ctx->submit_state, ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
io_sqd_update_thread_idle(sqd);
}
+static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+}
+
+static bool io_sq_thread_should_park(struct io_sq_data *sqd)
+{
+ return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+}
+
+static void io_sq_thread_parkme(struct io_sq_data *sqd)
+{
+ for (;;) {
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changing from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
+ if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+ break;
+
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the caller of kthread_park() may spend more time in
+ * wait_task_inactive().
+ */
+ preempt_disable();
+ complete(&sqd->completion);
+ schedule_preempt_disabled();
+ preempt_enable();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
static int io_sq_thread(void *data)
{
- struct cgroup_subsys_state *cur_css = NULL;
- struct files_struct *old_files = current->files;
- struct nsproxy *old_nsproxy = current->nsproxy;
- const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
unsigned long timeout = 0;
+ char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
- task_lock(current);
- current->files = NULL;
- current->nsproxy = NULL;
- task_unlock(current);
+ sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+ set_task_comm(current, buf);
+ sqd->thread = current;
+ current->pf_io_worker = NULL;
+
+ if (sqd->sq_cpu != -1)
+ set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
+ else
+ set_cpus_allowed_ptr(current, cpu_online_mask);
+ current->flags |= PF_NO_SETAFFINITY;
+
+ complete(&sqd->completion);
+
+ wait_for_completion(&sqd->startup);
- while (!kthread_should_stop()) {
+ while (!io_sq_thread_should_stop(sqd)) {
int ret;
bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
- * kthread parking. This synchronizes the thread vs users,
+ * thread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park()) {
- kthread_parkme();
- /*
- * When sq thread is unparked, in case the previous park operation
- * comes from io_put_sq_data(), which means that sq thread is going
- * to be stopped, so here needs to have a check.
- */
- if (kthread_should_stop())
- break;
+ if (io_sq_thread_should_park(sqd)) {
+ io_sq_thread_parkme(sqd);
+ continue;
}
-
if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
timeout = jiffies + sqd->sq_thread_idle;
}
-
+ if (fatal_signal_pending(current))
+ break;
sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- if (current->cred != ctx->creds) {
- if (old_cred)
- revert_creds(old_cred);
- old_cred = override_creds(ctx->creds);
- }
- io_sq_thread_associate_blkcg(ctx, &cur_css);
-#ifdef CONFIG_AUDIT
- current->loginuid = ctx->loginuid;
- current->sessionid = ctx->sessionid;
-#endif
-
ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
sqt_spin = true;
-
- io_sq_thread_drop_mm_files();
}
if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
- io_sq_thread_drop_mm_files();
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
}
}
- if (needs_sched && !kthread_should_park()) {
+ if (needs_sched && !io_sq_thread_should_park(sqd)) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
timeout = jiffies + sqd->sq_thread_idle;
}
- io_run_task_work();
- io_sq_thread_drop_mm_files();
-
- if (cur_css)
- io_sq_thread_unassociate_blkcg();
- if (old_cred)
- revert_creds(old_cred);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_cancel_sqpoll(ctx);
- task_lock(current);
- current->files = old_files;
- current->nsproxy = old_nsproxy;
- task_unlock(current);
+ io_run_task_work();
- kthread_parkme();
+ /*
+ * Clear thread under lock so that concurrent parks work correctly
+ */
+ complete_all(&sqd->completion);
+ mutex_lock(&sqd->lock);
+ sqd->thread = NULL;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ ctx->sqo_exec = 1;
+ io_ring_set_wakeup_flag(ctx);
+ }
+ mutex_unlock(&sqd->lock);
- return 0;
+ complete(&sqd->exited);
+ do_exit(0);
}
struct io_wait_queue {
percpu_ref_get(&rsrc_data->refs);
}
-static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
- struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *backup_node)
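+/* detach the current ref node from @data and kill its percpu ref */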
+static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
{
- struct fixed_rsrc_ref_node *ref_node;
- int ret;
+ struct fixed_rsrc_ref_node *ref_node = NULL;
io_rsrc_ref_lock(ctx);
ref_node = data->node;
+ data->node = NULL;
io_rsrc_ref_unlock(ctx);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
+}
+
+static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
+ struct io_ring_ctx *ctx,
+ void (*rsrc_put)(struct io_ring_ctx *ctx,
+ struct io_rsrc_put *prsrc))
+{
+ struct fixed_rsrc_ref_node *backup_node;
+ int ret;
- percpu_ref_kill(&data->refs);
+ if (data->quiesce)
+ return -ENXIO;
- /* wait for all refs nodes to complete */
- flush_delayed_work(&ctx->rsrc_put_work);
+ data->quiesce = true;
do {
+ ret = -ENOMEM;
+ backup_node = alloc_fixed_rsrc_ref_node(ctx);
+ if (!backup_node)
+ break;
+ backup_node->rsrc_data = data;
+ backup_node->rsrc_put = rsrc_put;
+
+ io_sqe_rsrc_kill_node(ctx, data);
+ percpu_ref_kill(&data->refs);
+ flush_delayed_work(&ctx->rsrc_put_work);
+
ret = wait_for_completion_interruptible(&data->done);
if (!ret)
break;
+
+ percpu_ref_resurrect(&data->refs);
+ io_sqe_rsrc_set_node(ctx, data, backup_node);
+ backup_node = NULL;
+ reinit_completion(&data->done);
+ mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
- if (ret < 0) {
- percpu_ref_resurrect(&data->refs);
- reinit_completion(&data->done);
- io_sqe_rsrc_set_node(ctx, data, backup_node);
- return ret;
- }
- } while (1);
+ mutex_lock(&ctx->uring_lock);
+ } while (ret >= 0);
+ data->quiesce = false;
- destroy_fixed_rsrc_ref_node(backup_node);
- return 0;
+ if (backup_node)
+ destroy_fixed_rsrc_ref_node(backup_node);
+ return ret;
}
static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_rsrc_data *data = ctx->file_data;
- struct fixed_rsrc_ref_node *backup_node;
unsigned nr_tables, i;
int ret;
- if (!data)
+ /*
+ * percpu_ref_is_dying() is used to stop parallel files unregister,
+ * since we may drop the uring lock later in this function to run
+ * task work.
+ */
+ if (!data || percpu_ref_is_dying(&data->refs))
return -ENXIO;
- backup_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!backup_node)
- return -ENOMEM;
- init_fixed_file_ref_node(ctx, backup_node);
-
- ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
+ ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
if (ret)
return ret;
return 0;
}
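+/* let a parked SQPOLL thread run again and drop sqd->lock */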
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+ __releases(&sqd->lock)
+{
+ if (!sqd->thread)
+ return;
+ if (sqd->thread == current)
+ return;
+ clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ wake_up_state(sqd->thread, TASK_PARKED);
+ mutex_unlock(&sqd->lock);
+}
+
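+/* park the SQPOLL thread under sqd->lock; returns false if the thread has already exited */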
+static bool io_sq_thread_park(struct io_sq_data *sqd)
+ __acquires(&sqd->lock)
+{
+ if (sqd->thread == current)
+ return true;
+ mutex_lock(&sqd->lock);
+ if (!sqd->thread) {
+ mutex_unlock(&sqd->lock);
+ return false;
+ }
+ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->completion);
+ return true;
+}
+
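+/* ask the SQPOLL thread to exit and wait until it has */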
+static void io_sq_thread_stop(struct io_sq_data *sqd)
+{
+ if (!sqd->thread)
+ return;
+
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->exited);
+}
+
static void io_put_sq_data(struct io_sq_data *sqd)
{
if (refcount_dec_and_test(&sqd->refs)) {
- /*
- * The park is a bit of a work-around, without it we get
- * warning spews on shutdown with SQPOLL set and affinity
- * set to a single CPU.
- */
+ io_sq_thread_stop(sqd);
+ kfree(sqd);
+ }
+}
+
+static void io_sq_thread_finish(struct io_ring_ctx *ctx)
+{
+ struct io_sq_data *sqd = ctx->sq_data;
+
+ if (sqd) {
+ complete(&sqd->startup);
if (sqd->thread) {
- kthread_park(sqd->thread);
- kthread_stop(sqd->thread);
+ wait_for_completion(&ctx->sq_thread_comp);
+ io_sq_thread_park(sqd);
}
- kfree(sqd);
+ mutex_lock(&sqd->ctx_lock);
+ list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
+ mutex_unlock(&sqd->ctx_lock);
+
+ if (sqd->thread)
+ io_sq_thread_unpark(sqd);
+
+ io_put_sq_data(sqd);
+ ctx->sq_data = NULL;
}
}
mutex_init(&sqd->ctx_lock);
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
+ init_completion(&sqd->startup);
+ init_completion(&sqd->completion);
+ init_completion(&sqd->exited);
return sqd;
}
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- kthread_unpark(sqd->thread);
- mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
-{
- if (!sqd->thread)
- return;
- mutex_lock(&sqd->lock);
- kthread_park(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd) {
- if (sqd->thread) {
- /*
- * We may arrive here from the error branch in
- * io_sq_offload_create() where the kthread is created
- * without being waked up, thus wake it up now to make
- * sure the wait will complete.
- */
- wake_up_process(sqd->thread);
- wait_for_completion(&ctx->sq_thread_comp);
-
- io_sq_thread_park(sqd);
- }
-
- mutex_lock(&sqd->ctx_lock);
- list_del(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- mutex_unlock(&sqd->ctx_lock);
-
- if (sqd->thread)
- io_sq_thread_unpark(sqd);
-
- io_put_sq_data(sqd);
- ctx->sq_data = NULL;
- }
-}
-
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
- io_sq_thread_stop(ctx);
-
- if (ctx->io_wq) {
- io_wq_destroy(ctx->io_wq);
- ctx->io_wq = NULL;
- }
-}
-
#if defined(CONFIG_UNIX)
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
skb->sk = sk;
nr_files = 0;
- fpl->user = get_uid(ctx->user);
+ fpl->user = get_uid(current_user());
for (i = 0; i < nr; i++) {
struct file *file = io_file_from_index(ctx, i + offset);
return req ? &req->work : NULL;
}
-static int io_init_wq_offload(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
{
+ struct io_wq_hash *hash;
struct io_wq_data data;
- struct fd f;
- struct io_ring_ctx *ctx_attach;
unsigned int concurrency;
- int ret = 0;
-
- data.user = ctx->user;
- data.free_work = io_free_work;
- data.do_work = io_wq_submit_work;
- if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-
- ctx->io_wq = io_wq_create(concurrency, &data);
- if (IS_ERR(ctx->io_wq)) {
- ret = PTR_ERR(ctx->io_wq);
- ctx->io_wq = NULL;
- }
- return ret;
+ hash = ctx->hash_map;
+ if (!hash) {
+ hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ if (!hash)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&hash->refs, 1);
+ init_waitqueue_head(&hash->wait);
+ ctx->hash_map = hash;
}
- f = fdget(p->wq_fd);
- if (!f.file)
- return -EBADF;
-
- if (f.file->f_op != &io_uring_fops) {
- ret = -EINVAL;
- goto out_fput;
- }
+ data.hash = hash;
+ data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
- ctx_attach = f.file->private_data;
- /* @io_wq is protected by holding the fd */
- if (!io_wq_get(ctx_attach->io_wq, &data)) {
- ret = -EINVAL;
- goto out_fput;
- }
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = ctx_attach->io_wq;
-out_fput:
- fdput(f);
- return ret;
+ return io_wq_create(concurrency, &data);
}
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+ struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
return ret;
}
+ tctx->io_wq = io_init_wq_offload(ctx);
+ if (IS_ERR(tctx->io_wq)) {
+ ret = PTR_ERR(tctx->io_wq);
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ return ret;
+ }
+
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
tctx->sqpoll = false;
- io_init_identity(&tctx->__identity);
- tctx->identity = &tctx->__identity;
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
struct io_uring_task *tctx = tsk->io_uring;
WARN_ON_ONCE(!xa_empty(&tctx->xa));
- WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
- if (tctx->identity != &tctx->__identity)
- kfree(tctx->identity);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
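+/* re-fork the SQPOLL thread after it exited and recreate its io_uring task context */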
+static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+{
+ int ret;
+
+ clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ reinit_completion(&sqd->completion);
+ ctx->sqo_dead = ctx->sqo_exec = 0;
+ sqd->task_pid = current->pid;
+ current->flags |= PF_IO_WORKER;
+ ret = io_wq_fork_thread(io_sq_thread, sqd);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret < 0) {
+ sqd->thread = NULL;
+ return ret;
+ }
+ wait_for_completion(&sqd->completion);
+ return io_uring_alloc_task_context(sqd->thread, ctx);
+}
+
static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret;
+ /* Retain compatibility with failing for an invalid attach attempt */
+ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+ IORING_SETUP_ATTACH_WQ) {
+ struct fd f;
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -ENXIO;
+ if (f.file->f_op != &io_uring_fops) {
+ fdput(f);
+ return -EINVAL;
+ }
+ fdput(f);
+ }
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sqd;
ctx->sq_thread_idle = HZ;
if (sqd->thread)
- goto done;
+ return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
if (!cpu_online(cpu))
goto err;
- sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
- cpu, "io_uring-sq");
+ sqd->sq_cpu = cpu;
} else {
- sqd->thread = kthread_create(io_sq_thread, sqd,
- "io_uring-sq");
+ sqd->sq_cpu = -1;
}
- if (IS_ERR(sqd->thread)) {
- ret = PTR_ERR(sqd->thread);
+
+ sqd->task_pid = current->pid;
+ current->flags |= PF_IO_WORKER;
+ ret = io_wq_fork_thread(io_sq_thread, sqd);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret < 0) {
sqd->thread = NULL;
goto err;
}
- ret = io_uring_alloc_task_context(sqd->thread);
+ wait_for_completion(&sqd->completion);
+ ret = io_uring_alloc_task_context(sqd->thread, ctx);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
goto err;
}
-done:
- ret = io_init_wq_offload(ctx, p);
- if (ret)
- goto err;
-
return 0;
err:
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
return ret;
}
{
struct io_sq_data *sqd = ctx->sq_data;
- if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
- wake_up_process(sqd->thread);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ complete(&sqd->startup);
}
static inline void __io_unaccount_mem(struct user_struct *user,
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
- if (ctx->limit_mem)
+ if (ctx->user)
__io_unaccount_mem(ctx->user, nr_pages);
if (ctx->mm_account)
{
int ret;
- if (ctx->limit_mem) {
+ if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
return ret;
static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct io_submit_state *submit_state = &ctx->submit_state;
+ struct io_comp_state *cs = &ctx->submit_state.comp;
mutex_lock(&ctx->uring_lock);
- if (submit_state->free_reqs)
+ if (submit_state->free_reqs) {
kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
submit_state->reqs);
-
- io_req_cache_free(&submit_state->comp.free_list, NULL);
+ submit_state->free_reqs = 0;
+ }
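+ /* move locked_free_list over to the plain free list under the lock */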
spin_lock_irq(&ctx->completion_lock);
- io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+ list_splice_init(&cs->locked_free_list, &cs->free_list);
+ cs->locked_free_nr = 0;
spin_unlock_irq(&ctx->completion_lock);
+ io_req_cache_free(&cs->free_list, NULL);
+
mutex_unlock(&ctx->uring_lock);
}
mutex_lock(&ctx->uring_lock);
mutex_unlock(&ctx->uring_lock);
- io_finish_async(ctx);
+ io_sq_thread_finish(ctx);
io_sqe_buffers_unregister(ctx);
- if (ctx->sqo_task) {
- put_task_struct(ctx->sqo_task);
- ctx->sqo_task = NULL;
+ if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
-#ifdef CONFIG_BLK_CGROUP
- if (ctx->sqo_blkcg_css)
- css_put(ctx->sqo_blkcg_css);
-#endif
-
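+ /* file table teardown is now done under the uring_lock */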
+ mutex_lock(&ctx->uring_lock);
io_sqe_files_unregister(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
idr_destroy(&ctx->personality_idr);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
- put_cred(ctx->creds);
io_req_caches_free(ctx, NULL);
+ if (ctx->hash_map)
+ io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx);
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- struct io_identity *iod;
+ const struct cred *creds;
- iod = idr_remove(&ctx->personality_idr, id);
- if (iod) {
- put_cred(iod->creds);
- if (refcount_dec_and_test(&iod->count))
- kfree(iod);
+ creds = idr_remove(&ctx->personality_idr, id);
+ if (creds) {
+ put_cred(creds);
return 0;
}
return -EINVAL;
}
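+/*
+ * Drain the ctx exit task_work list: atomically detach it with cmpxchg(),
+ * then run each queued callback in turn, rescheduling as needed.
+ */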
+static void io_run_ctx_fallback(struct io_ring_ctx *ctx)
+{
+ struct callback_head *work, *head, *next;
+
+ do {
+ do {
+ head = NULL;
+ work = READ_ONCE(ctx->exit_task_work);
+ } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
+
+ if (!work)
+ break;
+
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
+ } while (1);
+}
+
static void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
*/
do {
io_uring_try_cancel_requests(ctx, NULL, NULL);
+ io_run_ctx_fallback(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
io_kill_timeouts(ctx, NULL, NULL);
io_poll_remove_all(ctx, NULL, NULL);
- if (ctx->io_wq)
- io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
struct files_struct *files)
{
struct io_task_cancel cancel = { .task = task, .files = files, };
+ struct io_uring_task *tctx = current->io_uring;
while (1) {
enum io_wq_cancel cret;
bool ret = false;
- if (ctx->io_wq) {
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
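+ /* io-wq is now per task, so cancel via the current task's context */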
+ if (tctx && tctx->io_wq) {
+ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
&cancel, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
struct files_struct *files)
{
struct task_struct *task = current;
+ bool did_park = false;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
io_disable_sqo_submit(ctx);
- task = ctx->sq_data->thread;
- atomic_inc(&task->io_uring->in_idle);
- io_sq_thread_park(ctx->sq_data);
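+ /* parking can fail if the SQPOLL thread is already gone */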
+ did_park = io_sq_thread_park(ctx->sq_data);
+ if (did_park) {
+ task = ctx->sq_data->thread;
+ atomic_inc(&task->io_uring->in_idle);
+ }
}
io_cancel_defer_files(ctx, task, files);
if (!files)
io_uring_try_cancel_requests(ctx, task, NULL);
- if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+ if (did_park) {
atomic_dec(&task->io_uring->in_idle);
io_sq_thread_unpark(ctx->sq_data);
}
int ret;
if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current);
+ ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
tctx = current->io_uring;
io_uring_cancel_task_requests(file->private_data, files);
atomic_dec(&tctx->in_idle);
- if (files)
+ if (files) {
io_uring_remove_task_files(tctx);
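+ /* drop the per-task io-wq once the task's files go away */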
+ if (tctx->io_wq) {
+ io_wq_put(tctx->io_wq);
+ tctx->io_wq = NULL;
+ }
+ }
}
static s64 tctx_inflight(struct io_uring_task *tctx)
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
+ struct io_sq_data *sqd = ctx->sq_data;
struct io_uring_task *tctx;
s64 inflight;
DEFINE_WAIT(wait);
- if (!ctx->sq_data)
+ if (!sqd)
return;
- tctx = ctx->sq_data->thread->io_uring;
io_disable_sqo_submit(ctx);
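+ /* nothing to cancel if the SQPOLL thread can't be parked (already gone) */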
+ if (!io_sq_thread_park(sqd))
+ return;
+ tctx = ctx->sq_data->thread->io_uring;
atomic_inc(&tctx->in_idle);
do {
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
+ io_sq_thread_unpark(sqd);
}
/*
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx = file->private_data;
+ /* Ignore helper thread files exit */
+ if (current->flags & PF_IO_WORKER)
+ return 0;
+
if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
io_uring_cancel_task_requests(ctx, NULL);
io_req_caches_free(ctx, current);
}
+ io_run_ctx_fallback(ctx);
+
if (!tctx)
return 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx, false, NULL, NULL);
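+ /* ->sqo_exec set means the SQPOLL thread must be re-forked first */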
+ if (unlikely(ctx->sqo_exec)) {
+ ret = io_sq_thread_fork(ctx->sq_data, ctx);
+ if (ret)
+ goto out;
+ ctx->sqo_exec = 0;
+ }
ret = -EOWNERDEAD;
if (unlikely(ctx->sqo_dead))
goto out;
#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
- struct io_identity *iod = p;
- const struct cred *cred = iod->creds;
+ const struct cred *cred = p;
struct seq_file *m = data;
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
*/
has_lock = mutex_trylock(&ctx->uring_lock);
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+ if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
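+ /* don't report a thread that has already exited */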
+ if (!sq->thread)
+ sq = NULL;
+ }
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
- struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
struct file *file;
int ret;
p->cq_entries = 2 * p->sq_entries;
}
- user = get_uid(current_user());
-
ctx = io_ring_ctx_alloc(p);
- if (!ctx) {
- free_uid(user);
+ if (!ctx)
return -ENOMEM;
- }
ctx->compat = in_compat_syscall();
- ctx->limit_mem = !capable(CAP_IPC_LOCK);
- ctx->user = user;
- ctx->creds = get_current_cred();
-#ifdef CONFIG_AUDIT
- ctx->loginuid = current->loginuid;
- ctx->sessionid = current->sessionid;
-#endif
- ctx->sqo_task = get_task_struct(current);
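+ /* only account locked memory for tasks without CAP_IPC_LOCK */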
+ if (!capable(CAP_IPC_LOCK))
+ ctx->user = get_uid(current_user());
+ ctx->sqo_task = current;
/*
* This is just grabbed for accounting purposes. When a process exits,
mmgrab(current->mm);
ctx->mm_account = current->mm;
-#ifdef CONFIG_BLK_CGROUP
- /*
- * The sq thread will belong to the original cgroup it was inited in.
- * If the cgroup goes offline (e.g. disabling the io controller), then
- * issued bios will be associated with the closest cgroup later in the
- * block layer.
- */
- rcu_read_lock();
- ctx->sqo_blkcg_css = blkcg_css();
- ret = css_tryget_online(ctx->sqo_blkcg_css);
- rcu_read_unlock();
- if (!ret) {
- /* don't init against a dying cgroup, have the user try again */
- ctx->sqo_blkcg_css = NULL;
- ret = -ENODEV;
- goto err;
- }
-#endif
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG;
+ IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
static int io_register_personality(struct io_ring_ctx *ctx)
{
- struct io_identity *id;
+ const struct cred *creds;
int ret;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (unlikely(!id))
- return -ENOMEM;
-
- io_init_identity(id);
- id->creds = get_current_cred();
+ creds = get_current_cred();
- ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
- if (ret < 0) {
- put_cred(id->creds);
- kfree(id);
- }
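+ /* stash the bare cred pointer in the idr; io_identity is gone */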
+ ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+ USHRT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ put_cred(creds);
return ret;
}
ctx = f.file->private_data;
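+ /* flush pending task_work before taking the uring_lock */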
+ io_run_task_work();
+
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);