#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
*/
u32 sq_dropped;
/*
- * Runtime flags
+ * Runtime SQ flags
*
* Written by the kernel, shouldn't be modified by the
* application.
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
u32 sq_flags;
+ /*
+ * Runtime CQ flags
+ *
+ * Written by the application, shouldn't be modified by the
+ * kernel.
+ */
+ u32 cq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
struct list_head node;
struct list_head file_list;
struct fixed_file_data *file_data;
- struct work_struct work;
+ struct llist_node llist;
};
struct fixed_file_data {
const struct cred *creds;
- /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
- struct completion *completions;
+ struct completion ref_comp;
+ struct completion sq_thread_comp;
/* if all else fails... */
struct io_kiocb *fallback_req;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+ struct delayed_work file_put_work;
+ struct llist_head file_put_llist;
+
struct work_struct exit_work;
};
struct file *file;
u64 addr;
int flags;
- u32 count;
+ u32 off;
+ u32 target_seq;
};
struct io_rw {
struct io_open {
struct file *file;
int dfd;
- union {
- unsigned mask;
- };
struct filename *filename;
- struct statx __user *buffer;
struct open_how how;
unsigned long nofile;
};
__u16 bid;
};
+struct io_statx { /* statx arguments: filled by io_statx_prep(), consumed by io_statx() */
+ struct file *file; /* NOTE(review): unused by the statx code visible here; presumably overlays req->file like the other per-op structs -- confirm */
+ int dfd; /* read from sqe->fd */
+ unsigned int mask; /* read from sqe->len */
+ unsigned int flags; /* read from sqe->statx_flags */
+ const char __user *filename; /* user pointer taken from sqe->addr */
+ struct statx __user *buffer; /* user pointer taken from sqe->addr2 */
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
- REQ_F_IOPOLL_COMPLETED_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
+ REQ_F_QUEUE_TIMEOUT_BIT,
+ REQ_F_WORK_INITIALIZED_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* polled IO has completed */
- REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* timeout request */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
+ /* needs to queue linked timeout */
+ REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+ /* io_wq_work is initialized */
+ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
};
struct async_poll {
struct io_epoll epoll;
struct io_splice splice;
struct io_provide_buf pbuf;
+ struct io_statx statx;
};
struct io_async_ctx *io;
int cflags;
- bool needs_fixed_file;
u8 opcode;
+ /* polled IO has completed */
+ u8 iopoll_completed;
u16 buf_index;
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
+ /* don't fail if file grab fails */
+ unsigned needs_file_no_error : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
+ .needs_file_no_error = 1,
.file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
+ [IORING_OP_TEE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
};
static void io_wq_submit_work(struct io_wq_work **workptr);
}
EXPORT_SYMBOL(io_uring_get_socket);
+static void io_file_put_work(struct work_struct *work);
+
+/*
+ * Lazily zero req->work.
+ *
+ * Call this before the first access to any member of io_wq_work. Once
+ * REQ_F_WORK_INITIALIZED is set, further calls leave req->work untouched.
+ */
+static inline void io_req_init_async(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
+		memset(&req->work, 0, sizeof(req->work));
+		req->flags |= REQ_F_WORK_INITIALIZED;
+	}
+}
+
+/* Returns true when IORING_SETUP_SQPOLL is set in ctx->flags. */
+static inline bool io_async_submit(struct io_ring_ctx *ctx)
+{
+	const bool sqpoll = ctx->flags & IORING_SETUP_SQPOLL;
+
+	return sqpoll;
+}
+
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
- complete(&ctx->completions[0]);
+ complete(&ctx->ref_comp);
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx->fallback_req)
goto err;
- ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
- if (!ctx->completions)
- goto err;
-
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
init_waitqueue_head(&ctx->sqo_wait);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
- init_completion(&ctx->completions[0]);
- init_completion(&ctx->completions[1]);
+ init_completion(&ctx->ref_comp);
+ init_completion(&ctx->sq_thread_comp);
idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
+ INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
+ init_llist_head(&ctx->file_put_llist);
return ctx;
err:
if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req);
- kfree(ctx->completions);
kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
return false;
}
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req;
-
- req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
- if (req && !req_need_defer(req)) {
- list_del_init(&req->list);
- return req;
- }
-
- return NULL;
-}
-
-static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req;
-
- req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
- if (req) {
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
- return NULL;
- if (!__req_need_defer(req)) {
- list_del_init(&req->list);
- return req;
- }
- }
-
- return NULL;
-}
-
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ return;
+
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
spin_unlock_irq(&ctx->completion_lock);
}
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
- struct io_kiocb *req;
+ do {
+ struct io_kiocb *req = list_first_entry(&ctx->defer_list,
+ struct io_kiocb, list);
- while ((req = io_get_timeout_req(ctx)) != NULL)
+ if (req_need_defer(req))
+ break;
+ list_del_init(&req->list);
+ io_queue_async_work(req);
+ } while (!list_empty(&ctx->defer_list));
+}
+
+/*
+ * Walk ctx->timeout_list from the head. Each entry whose target_seq equals
+ * cached_cq_tail minus cq_timeouts is unlinked and passed to
+ * io_kill_timeout(). The walk ends at the first entry that is flagged
+ * REQ_F_TIMEOUT_NOSEQ or whose target_seq does not match, or when the list
+ * is empty.
+ */
+static void io_flush_timeouts(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req;
+
+	for (;;) {
+		if (list_empty(&ctx->timeout_list))
+			return;
+
+		req = list_first_entry(&ctx->timeout_list, struct io_kiocb,
+				       list);
+		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+			return;
+		if (req->timeout.target_seq != ctx->cached_cq_tail
+					- atomic_read(&ctx->cq_timeouts))
+			return;
+
+		list_del_init(&req->list);
 		io_kill_timeout(req);
+	}
+}
+static void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ io_flush_timeouts(ctx);
__io_commit_cqring(ctx);
- while ((req = io_get_deferred_req(ctx)) != NULL)
- io_queue_async_work(req);
+ if (unlikely(!list_empty(&ctx->defer_list)))
+ __io_queue_deferred(ctx);
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
if (!ctx->cq_ev_fd)
return false;
+ if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+ return false;
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker();
io_queue_async_work(nxt);
}
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *link;
-
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- io_queue_linked_timeout(link);
- io_wq_submit_work(workptr);
-}
-
static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
{
struct io_kiocb *link;
*workptr = &nxt->work;
link = io_prep_linked_timeout(nxt);
if (link)
- nxt->work.func = io_link_work_cb;
+ nxt->flags |= REQ_F_QUEUE_TIMEOUT;
}
/*
* If we find a request that requires polling, break out
* and complete those lists first, if we have entries there.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+ if (READ_ONCE(req->iopoll_completed)) {
list_move_tail(&req->list, &done);
continue;
}
req_set_fail_links(req);
req->result = res;
if (res != -EAGAIN)
- req->flags |= REQ_F_IOPOLL_COMPLETED;
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
* For fast devices, IO may have already completed. If it has, add
* it to the front so we find it first.
*/
- if (req->flags & REQ_F_IOPOLL_COMPLETED)
+ if (READ_ONCE(req->iopoll_completed))
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
wake_up(&ctx->sqo_wait);
}
-static void io_file_put(struct io_submit_state *state)
+static void __io_state_file_put(struct io_submit_state *state)
{
- if (state->file) {
- int diff = state->has_refs - state->used_refs;
+ int diff = state->has_refs - state->used_refs;
- if (diff)
- fput_many(state->file, diff);
- state->file = NULL;
- }
+ if (diff)
+ fput_many(state->file, diff);
+ state->file = NULL;
+}
+
+/* Call __io_state_file_put() only when the submit state holds a file. */
+static inline void io_state_file_put(struct io_submit_state *state)
+{
+	if (!state->file)
+		return;
+
+	__io_state_file_put(state);
+}
/*
state->ios_left--;
return state->file;
}
- io_file_put(state);
+ __io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
if (!state->file)
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
+ /* any ->read/write should understand O_NONBLOCK */
+ if (file->f_flags & O_NONBLOCK)
+ return true;
+
if (!(file->f_mode & FMODE_NOWAIT))
return false;
kiocb->ki_ioprio = get_current_ioprio();
/* don't allow async punt if RWF_NOWAIT was requested */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- (req->file->f_flags & O_NONBLOCK))
+ if (kiocb->ki_flags & IOCB_NOWAIT)
req->flags |= REQ_F_NOWAIT;
if (force_nonblock)
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->result = 0;
+ req->iopoll_completed = 0;
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
bool needs_lock)
{
- if (req->flags & REQ_F_BUFFER_SELECTED)
+ if (req->flags & REQ_F_BUFFER_SELECTED) {
+ struct io_buffer *kbuf;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov[0].iov_len = kbuf->len;
return 0;
+ }
if (!req->rw.len)
return 0;
else if (req->rw.len > 1)
if (ret)
goto out_free;
/* any defer here is final, must blocking retry */
- if (!file_can_poll(req->file))
+ if (!(req->flags & REQ_F_NOWAIT) &&
+ !file_can_poll(req->file))
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
return ret;
}
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_splice_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
struct io_splice* sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
sp->file_in = NULL;
- sp->off_in = READ_ONCE(sqe->splice_off_in);
- sp->off_out = READ_ONCE(sqe->off);
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
return ret;
req->flags |= REQ_F_NEED_CLEANUP;
- if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+ /*
+ * The splice operation will be punted async, and here we need to
+ * modify io_wq_work.flags, so initialize io_wq_work first.
+ */
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_UNBOUND;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare a tee request. Both offset fields of the SQE must be zero,
+ * otherwise -EINVAL is returned; the rest of the preparation is shared
+ * with splice via __io_splice_prep().
+ */
+static int io_tee_prep(struct io_kiocb *req,
+		       const struct io_uring_sqe *sqe)
+{
+	if (READ_ONCE(sqe->splice_off_in))
+		return -EINVAL;
+	if (READ_ONCE(sqe->off))
+		return -EINVAL;
+
+	return __io_splice_prep(req, sqe);
+}
+
+/*
+ * Issue a tee request.
+ *
+ * Returns -EAGAIN without doing anything when force_nonblock is set.
+ * Otherwise calls do_tee() (skipped for a zero length), releases the input
+ * file, posts the result as the completion, marks linked requests failed
+ * when the result differs from the requested length, and returns 0.
+ */
+static int io_tee(struct io_kiocb *req, bool force_nonblock)
+{
+	struct io_splice *tee = &req->splice;
+	struct file *src, *dst;
+	unsigned int tee_flags;
+	long copied;
+
+	if (force_nonblock)
+		return -EAGAIN;
+
+	src = tee->file_in;
+	dst = tee->file_out;
+	/* SPLICE_F_FD_IN_FIXED is stripped before the flags reach do_tee() */
+	tee_flags = tee->flags & ~SPLICE_F_FD_IN_FIXED;
+	copied = tee->len ? do_tee(src, dst, tee->len, tee_flags) : 0;
+
+	io_put_file(req, src, (tee->flags & SPLICE_F_FD_IN_FIXED));
+	req->flags &= ~REQ_F_NEED_CLEANUP;
+	io_cqring_add_event(req, copied);
+	if (copied != tee->len)
+		req_set_fail_links(req);
+	io_put_req(req);
 	return 0;
 }
+/*
+ * Prepare a splice request: record both offsets from the SQE, then run the
+ * preparation shared with tee in __io_splice_prep().
+ */
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	req->splice.off_in = READ_ONCE(sqe->splice_off_in);
+	req->splice.off_out = READ_ONCE(sqe->off);
+
+	return __io_splice_prep(req, sqe);
+}
+
static int io_splice(struct io_kiocb *req, bool force_nonblock)
{
struct io_splice *sp = &req->splice;
return 0;
}
-static bool io_req_cancelled(struct io_kiocb *req)
-{
- if (req->work.flags & IO_WQ_WORK_CANCEL) {
- req_set_fail_links(req);
- io_cqring_add_event(req, -ECANCELED);
- io_put_req(req);
- return true;
- }
-
- return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
{
loff_t end = req->sync.off + req->sync.len;
int ret;
+ /* fsync always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fsync(req);
- io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
- /* fsync always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_fsync_finish;
- return -EAGAIN;
- }
- __io_fsync(req);
return 0;
}
-static void __io_fallocate(struct io_kiocb *req)
-{
- int ret;
-
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_fallocate(req);
- io_steal_work(req, workptr);
-}
-
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
+ int ret;
+
/* fallocate always requiring blocking context */
- if (force_nonblock) {
- req->work.func = io_fallocate_finish;
+ if (force_nonblock)
return -EAGAIN;
- }
- __io_fallocate(req);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+ ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+ req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
return 0;
}
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
int ret;
- if (sqe->ioprio || sqe->buf_index)
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
+ if (unlikely(sqe->ioprio || sqe->buf_index))
+ return -EINVAL;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
- req->open.dfd = READ_ONCE(sqe->fd);
- req->open.how.mode = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.how.flags = READ_ONCE(sqe->open_flags);
- if (force_o_largefile())
+ /* open.how should be already initialised */
+ if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
+ req->open.dfd = READ_ONCE(sqe->fd);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
-
req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
+/*
+ * Prepare an openat request: build req->open.how from the SQE's open flags
+ * and mode, then continue in __io_openat_prep().
+ */
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	u64 how_mode, how_flags;
+
+	/*
+	 * __io_openat_prep() sets REQ_F_NEED_CLEANUP once it has copied the
+	 * filename in; if the flag is already set there is nothing to redo.
+	 */
+	if (req->flags & REQ_F_NEED_CLEANUP)
+		return 0;
+
+	how_mode = READ_ONCE(sqe->len);
+	how_flags = READ_ONCE(sqe->open_flags);
+	req->open.how = build_open_how(how_flags, how_mode);
+
+	return __io_openat_prep(req, sqe);
+}
+
static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct open_how __user *how;
- const char __user *fname;
size_t len;
int ret;
- if (sqe->ioprio || sqe->buf_index)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
-
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
if (ret)
return ret;
- if (!(req->open.how.flags & O_PATH) && force_o_largefile())
- req->open.how.flags |= O_LARGEFILE;
-
- req->open.filename = getname(fname);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
+ return __io_openat_prep(req, sqe);
}
static int io_openat2(struct io_kiocb *req, bool force_nonblock)
static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
- req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, force_nonblock);
}
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
- if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+ if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
req->epoll.op = READ_ONCE(sqe->len);
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
if (sqe->ioprio || sqe->buf_index || sqe->off)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
req->madvise.len = READ_ONCE(sqe->len);
{
if (sqe->ioprio || sqe->buf_index || sqe->addr)
return -EINVAL;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
req->fadvise.len = READ_ONCE(sqe->len);
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- const char __user *fname;
- unsigned lookup_flags;
- int ret;
-
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
- if (req->flags & REQ_F_NEED_CLEANUP)
- return 0;
- req->open.dfd = READ_ONCE(sqe->fd);
- req->open.mask = READ_ONCE(sqe->len);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- req->open.how.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.dfd = READ_ONCE(sqe->fd);
+ req->statx.mask = READ_ONCE(sqe->len);
+ req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ req->statx.flags = READ_ONCE(sqe->statx_flags);
- if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
- return -EINVAL;
-
- req->open.filename = getname_flags(fname, lookup_flags, NULL);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
- struct io_open *ctx = &req->open;
- unsigned lookup_flags;
- struct path path;
- struct kstat stat;
+ struct io_statx *ctx = &req->statx;
int ret;
if (force_nonblock) {
return -EAGAIN;
}
- if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
- return -EINVAL;
-
-retry:
- /* filename_lookup() drops it, keep a reference */
- ctx->filename->refcnt++;
-
- ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
- NULL);
- if (ret)
- goto err;
+ ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
+ ctx->buffer);
- ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- if (!ret)
- ret = cp_statx(&stat, ctx->buffer);
-err:
- putname(ctx->filename);
- req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
{
/*
* If we queue this for async, it must not be cancellable. That would
- * leave the 'file' in an undeterminate state.
+ * leave the 'file' in an indeterminate state. We also need to modify
+ * io_wq_work.flags here, so initialize io_wq_work first.
*/
+ io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if (req->file->f_op == &io_uring_fops ||
+ if ((req->file && req->file->f_op == &io_uring_fops) ||
req->close.fd == req->ctx->ring_fd)
return -EBADF;
+ req->close.put_file = NULL;
return 0;
}
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- fput(req->close.put_file);
- io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- /* not cancellable, don't do io_req_cancelled() */
- __io_close_finish(req);
- io_steal_work(req, workptr);
-}
-
static int io_close(struct io_kiocb *req, bool force_nonblock)
{
+ struct io_close *close = &req->close;
int ret;
- req->close.put_file = NULL;
- ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
- if (ret < 0)
- return ret;
+ /* might be already done during nonblock submission */
+ if (!close->put_file) {
+ ret = __close_fd_get_file(close->fd, &close->put_file);
+ if (ret < 0)
+ return (ret == -ENOENT) ? -EBADF : ret;
+ }
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && force_nonblock) {
- /* submission ref will be dropped, take it for async */
- refcount_inc(&req->refs);
-
- req->work.func = io_close_finish;
- /*
- * Do manual async queue here to avoid grabbing files - we don't
- * need the files, and it'll cause io_close_finish() to close
- * the file again and cause a double CQE entry for this request
- */
- io_queue_async_work(req);
- return 0;
+ if (close->put_file->f_op->flush && force_nonblock) {
+ /* avoid grabbing files - we don't need the files */
+ req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
+ return -EAGAIN;
}
- /*
- * No ->flush(), safely close from here and just punt the
- * fput() to async context.
- */
- __io_close_finish(req);
+ /* No ->flush() or already async, safely close from here */
+ ret = filp_close(close->put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ fput(close->put_file);
+ close->put_file = NULL;
+ io_put_req(req);
return 0;
}
return 0;
}
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
int ret;
+ /* sync_file_range always requires a blocking context */
+ if (force_nonblock)
+ return -EAGAIN;
+
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_sync_file_range(req);
- io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
- /* sync_file_range always requires a blocking context */
- if (force_nonblock) {
- req->work.func = io_sync_file_range_finish;
- return -EAGAIN;
- }
-
- __io_sync_file_range(req);
return 0;
}
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
struct socket *sock;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
int ret;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_buffer *kbuf;
struct socket *sock;
int ret, cflags = 0;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
return 0;
}
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
- unsigned file_flags;
+ unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
int ret;
- file_flags = force_nonblock ? O_NONBLOCK : 0;
+ if (req->file->f_flags & O_NONBLOCK)
+ req->flags |= REQ_F_NOWAIT;
+
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
accept->addr_len, accept->flags,
accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
req_set_fail_links(req);
+ }
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
-static void io_accept_finish(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- if (io_req_cancelled(req))
- return;
- __io_accept(req, false);
- io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
- int ret;
-
- ret = __io_accept(req, force_nonblock);
- if (ret == -EAGAIN && force_nonblock) {
- req->work.func = io_accept_finish;
- return -EAGAIN;
- }
- return 0;
-}
-
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- return -EOPNOTSUPP;
-}
-
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
-{
- return -EOPNOTSUPP;
-}
-#endif /* CONFIG_NET */
-
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int error;
-};
-
-static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head)
-{
- if (unlikely(poll->head)) {
- pt->error = -EINVAL;
- return;
- }
-
- pt->error = 0;
- poll->head = head;
- add_wait_queue(head, &poll->wait);
+ return -EOPNOTSUPP;
}
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->apoll->poll, pt, head);
+ return -EOPNOTSUPP;
}
+#endif /* CONFIG_NET */
+
+struct io_poll_table { /* wrapper around the poll_table_struct handed to vfs_poll() */
+ struct poll_table_struct pt; /* queue procs recover the wrapper from this via container_of() */
+ struct io_kiocb *req; /* request whose poll entry is being queued */
+ int error; /* written by __io_queue_proc(): 0 on success, -EINVAL or -ENOMEM on failure */
+};
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
return false;
}
+/*
+ * Detach the extra poll entry stored in req->io, if there is one and it is
+ * still attached to a wait queue head. The caller must hold
+ * ctx->completion_lock. Under the head's lock the entry is unlinked, one
+ * request reference is dropped when wait.private is set, and the entry's
+ * head pointer is cleared.
+ */
+static void io_poll_remove_double(struct io_kiocb *req)
+{
+	struct io_poll_iocb *second = (struct io_poll_iocb *) req->io;
+	struct wait_queue_head *wq;
+
+	lockdep_assert_held(&req->ctx->completion_lock);
+
+	if (!second || !second->head)
+		return;
+
+	wq = second->head;
+	spin_lock(&wq->lock);
+	list_del_init(&second->wait.entry);
+	if (second->wait.private)
+		refcount_dec(&req->refs);
+	second->head = NULL;
+	spin_unlock(&wq->lock);
+}
+
+/*
+ * Finish a poll request: drop any extra poll entry, mark the poll done, and
+ * fill a completion event carrying 'error' when it is non-zero or the
+ * mangled poll mask otherwise, then commit the CQ ring.
+ */
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	int res = error;
+
+	io_poll_remove_double(req);
+	req->poll.done = true;
+
+	if (!res)
+		res = mangle_poll(mask);
+	io_cqring_fill_event(req, res);
+	io_commit_cqring(ctx);
+}
+
+/*
+ * Complete a poll request from task work.
+ *
+ * ctx->completion_lock is released on every path but never taken here, so
+ * io_poll_rewait() presumably returns with it held -- TODO confirm.
+ * When io_poll_rewait() returns true nothing is completed. Otherwise the
+ * request is unhashed, completed with req->result as the poll mask, and
+ * put via io_put_req_find_next() with *nxt as its output.
+ */
+static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	bool rewait = io_poll_rewait(req, &req->poll);
+
+	if (!rewait) {
+		hash_del(&req->hash_node);
+		io_poll_complete(req, req->result, 0);
+		req->flags |= REQ_F_COMP_LOCKED;
+		io_put_req_find_next(req, nxt);
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	/* only signal waiters when a completion was actually posted */
+	if (!rewait)
+		io_cqring_ev_posted(ctx);
+}
+
+/*
+ * task_work callback for poll requests: run io_poll_task_handler() and, if
+ * it produced a next request, queue that request under its ring's
+ * uring_lock.
+ */
+static void io_poll_task_func(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_kiocb *next = NULL;
+	struct io_ring_ctx *ctx;
+
+	io_poll_task_handler(req, &next);
+	if (!next)
+		return;
+
+	ctx = next->ctx;
+	mutex_lock(&ctx->uring_lock);
+	__io_queue_sqe(next, NULL);
+	mutex_unlock(&ctx->uring_lock);
+}
+
+/*
+ * Wake callback that __io_queue_proc() installs on the extra poll entry
+ * kept in req->io.
+ *
+ * Returns 0 when the wake key carries a mask that does not intersect the
+ * entry's events. Otherwise, if the primary entry (req->poll) still has a
+ * head and is still queued there, it is dequeued under that head's lock and
+ * __io_async_wake() is invoked with io_poll_task_func. One request
+ * reference is then dropped and 1 is returned.
+ */
+static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
+			       int sync, void *key)
+{
+	struct io_kiocb *req = wait->private;
+	struct io_poll_iocb *second = (struct io_poll_iocb *) req->io;
+	struct io_poll_iocb *first = &req->poll;
+	__poll_t mask = key_to_poll(key);
+	bool already_gone;
+
+	/* for instances that support it check for an event match first: */
+	if (mask && !(mask & second->events))
+		return 0;
+
+	if (first->head) {
+		spin_lock(&first->head->lock);
+		already_gone = list_empty(&first->wait.entry);
+		if (!already_gone)
+			list_del_init(&first->wait.entry);
+		spin_unlock(&first->head->lock);
+
+		if (!already_gone)
+			__io_async_wake(req, second, mask, io_poll_task_func);
+	}
+	refcount_dec(&req->refs);
+	return 1;
+}
+
+/*
+ * Reset a poll entry to its unarmed state: no head, not done, not
+ * canceled, the requested event mask, an empty wait-list node, and
+ * wake_func as the wake callback. wait.private is not assigned here; the
+ * callers set it after this returns.
+ */
+static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
+			      wait_queue_func_t wake_func)
+{
+	poll->events = events;
+	poll->head = NULL;
+	poll->done = poll->canceled = false;
+
+	INIT_LIST_HEAD(&poll->wait.entry);
+	init_waitqueue_func_entry(&poll->wait, wake_func);
+}
+
+/*
+ * Queue a poll entry on the wait queue 'head'.
+ *
+ * @poll: primary poll entry for the request (callers pass either
+ *        &req->poll or &req->apoll->poll)
+ * @pt:   poll table wrapper; pt->error receives 0 on success, -EINVAL if a
+ *        third wait queue is requested, -ENOMEM if the extra entry cannot
+ *        be allocated
+ * @head: wait queue head to attach to
+ *
+ * On the first call the primary entry is attached. On a second call (the
+ * primary entry already has a head) an extra entry is allocated, stored in
+ * req->io, and attached instead; it holds its own request reference.
+ */
+static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
+			    struct wait_queue_head *head)
+{
+	struct io_kiocb *req = pt->req;
+	/* remember the primary entry; 'poll' is redirected below */
+	struct io_poll_iocb *poll_one = poll;
+
+	/*
+	 * If poll->head is already set, it's because the file being polled
+	 * uses multiple waitqueues for poll handling (eg one for read, one
+	 * for write). Setup a separate io_poll_iocb if this happens.
+	 */
+	if (unlikely(poll->head)) {
+		/*
+		 * already have a 2nd entry, fail a third attempt
+		 *
+		 * NOTE(review): req->io is also declared as the async
+		 * context pointer; if it can be set for an unrelated reason
+		 * on this path, this check rejects a legitimate second
+		 * queue -- confirm against callers.
+		 */
+		if (req->io) {
+			pt->error = -EINVAL;
+			return;
+		}
+		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
+		if (!poll) {
+			pt->error = -ENOMEM;
+			return;
+		}
+		/*
+		 * Copy the event mask from the entry that was actually armed.
+		 * For the async-poll caller that is apoll->poll, not
+		 * req->poll, so req->poll.events must not be used here.
+		 */
+		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
+		refcount_inc(&req->refs);
+		poll->wait.private = req;
+		req->io = (void *) poll;
+	}
+
+	pt->error = 0;
+	poll->head = head;
+	add_wait_queue(head, &poll->wait);
+}
+
+/*
+ * Queue proc for the async-poll path: recover the io_poll_table wrapper
+ * from the poll_table_struct and queue the request's apoll->poll entry on
+ * 'head'.
+ */
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+				struct poll_table_struct *p)
+{
+	struct io_poll_table *table = container_of(p, struct io_poll_table, pt);
+	struct async_poll *apoll = table->req->apoll;
+
+	__io_queue_proc(&apoll->poll, table, head);
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- bool canceled;
+ bool canceled = false;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
return;
}
- if (hash_hashed(&req->hash_node))
+ /* If req is still hashed, it cannot have been canceled. Don't check. */
+ if (hash_hashed(&req->hash_node)) {
hash_del(&req->hash_node);
-
- canceled = READ_ONCE(apoll->poll.canceled);
- if (canceled) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
+ } else {
+ canceled = READ_ONCE(apoll->poll.canceled);
+ if (canceled) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
+ }
}
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
- if (canceled) {
- kfree(apoll);
+ if (!canceled) {
+ __set_current_state(TASK_RUNNING);
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL);
+ mutex_unlock(&ctx->uring_lock);
+ } else {
io_cqring_ev_posted(ctx);
req_set_fail_links(req);
io_double_put_req(req);
- return;
}
-
- __set_current_state(TASK_RUNNING);
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(req, NULL);
- mutex_unlock(&ctx->uring_lock);
-
- kfree(apoll);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
bool cancel = false;
poll->file = req->file;
- poll->head = NULL;
- poll->done = poll->canceled = false;
- poll->events = mask;
+ io_init_poll_iocb(poll, mask, wake_func);
+ poll->wait.private = req;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = -EINVAL;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
- poll->wait.private = req;
-
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
spin_lock_irq(&ctx->completion_lock);
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask, ret;
+ bool had_io;
if (!req->file || !file_can_poll(req->file))
return false;
return false;
req->flags |= REQ_F_POLLED;
- memcpy(&apoll->work, &req->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
+ had_io = req->io != NULL;
get_task_struct(current);
req->task = current;
io_async_wake);
if (ret) {
ipt.error = 0;
- apoll->poll.done = true;
+ /* only remove double add if we did it here */
+ if (!had_io)
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- memcpy(&req->work, &apoll->work, sizeof(req->work));
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
kfree(apoll);
return false;
}
do_complete = true;
}
spin_unlock(&poll->head->lock);
+ hash_del(&req->hash_node);
return do_complete;
}
static bool io_poll_remove_one(struct io_kiocb *req)
{
- struct async_poll *apoll = NULL;
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
+ io_poll_remove_double(req);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
- apoll = req->apoll;
+ struct async_poll *apoll = req->apoll;
+
/* non-poll requests have submit ref still */
- do_complete = __io_poll_remove_one(req, &req->apoll->poll);
- if (do_complete)
+ do_complete = __io_poll_remove_one(req, &apoll->poll);
+ if (do_complete) {
io_put_req(req);
- }
-
- hash_del(&req->hash_node);
-
- if (do_complete && apoll) {
- /*
- * restore ->work because we need to call io_req_work_drop_env.
- */
- memcpy(&req->work, &apoll->work, sizeof(req->work));
- kfree(apoll);
+ /*
+ * restore ->work because we will call
+ * io_req_work_drop_env below when dropping the
+ * final reference.
+ */
+ if (req->flags & REQ_F_WORK_INITIALIZED)
+ memcpy(&req->work, &apoll->work,
+ sizeof(req->work));
+ kfree(apoll);
+ }
}
if (do_complete) {
return 0;
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- req->poll.done = true;
- io_cqring_fill_event(req, error ? error : mangle_poll(mask));
- io_commit_cqring(ctx);
-}
-
-static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_poll_iocb *poll = &req->poll;
-
- if (io_poll_rewait(req, poll)) {
- spin_unlock_irq(&ctx->completion_lock);
- return;
- }
-
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req_find_next(req, nxt);
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
-}
-
-static void io_poll_task_func(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_kiocb *nxt = NULL;
-
- io_poll_task_handler(req, &nxt);
- if (nxt) {
- struct io_ring_ctx *ctx = nxt->ctx;
-
- mutex_lock(&ctx->uring_lock);
- __io_queue_sqe(nxt, NULL);
- mutex_unlock(&ctx->uring_lock);
- }
-}
-
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
*/
- if (!list_empty(&req->list)) {
- struct io_kiocb *prev;
-
- /*
- * Adjust the reqs sequence before the current one because it
- * will consume a slot in the cq_ring and the cq_tail
- * pointer will be increased, otherwise other timeout reqs may
- * return in advance without waiting for enough wait_nr.
- */
- prev = req;
- list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
- prev->sequence++;
+ if (!list_empty(&req->list))
list_del_init(&req->list);
- }
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
{
struct io_timeout_data *data;
unsigned flags;
+ u32 off = READ_ONCE(sqe->off);
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
return -EINVAL;
- if (sqe->off && is_timeout_link)
+ if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- req->timeout.count = READ_ONCE(sqe->off);
+ req->timeout.off = off;
if (!req->io && io_alloc_async_ctx(req))
return -ENOMEM;
static int io_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data;
+ struct io_timeout_data *data = &req->io->timeout;
struct list_head *entry;
- unsigned span = 0;
- u32 count = req->timeout.count;
- u32 seq = req->sequence;
+ u32 tail, off = req->timeout.off;
- data = &req->io->timeout;
+ spin_lock_irq(&ctx->completion_lock);
/*
* sqe->off holds how many events that need to occur for this
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- if (!count) {
+ if (!off) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
- spin_lock_irq(&ctx->completion_lock);
entry = ctx->timeout_list.prev;
goto add;
}
- req->sequence = seq + count;
+ tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ req->timeout.target_seq = tail + off;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
- spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_seq;
- long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->timeout.count;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
-
- /*
- * Since seq + count can overflow, use type long
- * long to store it.
- */
- tmp = (long long)seq + count;
- nxt_seq = nxt->sequence - nxt_offset;
- tmp_nxt = (long long)nxt_seq + nxt_offset;
-
- /*
- * cached_sq_head may overflow, and it will never overflow twice
- * once there is some timeout req still be valid.
- */
- if (seq < nxt_seq)
- tmp += UINT_MAX;
-
- if (tmp > tmp_nxt)
+ /* nxt.seq is behind @tail, otherwise would've been completed */
+ if (off >= nxt->timeout.target_seq - tail)
break;
-
- /*
- * Sequence of reqs after the insert one and itself should
- * be adjusted because each timeout req consumes a slot.
- */
- span++;
- nxt->sequence++;
}
- req->sequence -= span;
add:
list_add(&req->list, entry);
data->timer.function = io_timeout_fn;
if (!sqe)
return 0;
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
case IORING_OP_REMOVE_BUFFERS:
ret = io_remove_buffers_prep(req, sqe);
break;
+ case IORING_OP_TEE:
+ ret = io_tee_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
break;
case IORING_OP_OPENAT:
case IORING_OP_OPENAT2:
- case IORING_OP_STATX:
- putname(req->open.filename);
break;
case IORING_OP_SPLICE:
+ case IORING_OP_TEE:
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
}
ret = io_remove_buffers(req, force_nonblock);
break;
+ case IORING_OP_TEE:
+ if (sqe) {
+ ret = io_tee_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_tee(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
return 0;
}
+/*
+ * If @req carries REQ_F_QUEUE_TIMEOUT, queue the request found at the
+ * front of req->link_list through io_queue_linked_timeout(). Does nothing
+ * when the flag is clear.
+ */
+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ /* link head's timeout is queued in io_queue_async_work() */
+ if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+ return;
+
+ /* NOTE(review): assumes the first linked entry is the timeout -- confirm */
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+}
+
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
int ret = 0;
+ io_arm_async_linked_timeout(req);
+
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
struct fixed_file_table *table;
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
- return table->files[index & IORING_FILE_TABLE_MASK];;
+ return table->files[index & IORING_FILE_TABLE_MASK];
}
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (!file)
- return -EBADF;
- req->fixed_file_refs = ctx->file_data->cur_refs;
- percpu_ref_get(req->fixed_file_refs);
+ if (file) {
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
- if (unlikely(!file))
- return -EBADF;
}
- *out_file = file;
- return 0;
+ if (file || io_op_defs[req->opcode].needs_file_no_error) {
+ *out_file = file;
+ return 0;
+ }
+ return -EBADF;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
bool fixed;
fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
- if (unlikely(!fixed && req->needs_fixed_file))
+ if (unlikely(!fixed && io_async_submit(req->ctx)))
return -EBADF;
return io_file_get(state, req, fd, &req->file, fixed);
again:
linked_timeout = io_prep_linked_timeout(req);
- if (req->work.creds && req->work.creds != current_cred()) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+ req->work.creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
if (old_creds == req->work.creds)
goto exit;
}
punt:
+ io_req_init_async(req);
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
}
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_submit_state *state, struct io_kiocb **link)
+ struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
+/*
+ * Finish a submission batch: end the block plug held in @state, call
+ * io_state_file_put() on it, and, if state->free_reqs is non-zero, return
+ * that many objects from state->reqs to req_cachep in one bulk free.
+ */
static void io_submit_state_end(struct io_submit_state *state)
{
blk_finish_plug(&state->plug);
- io_file_put(state);
+ io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
}
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe,
- struct io_submit_state *state, bool async)
+ struct io_submit_state *state)
{
unsigned int sqe_flags;
int id;
refcount_set(&req->refs, 2);
req->task = NULL;
req->result = 0;
- req->needs_fixed_file = async;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
if (io_op_defs[req->opcode].needs_mm && !current->mm) {
if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
return -EFAULT;
- use_mm(ctx->sqo_mm);
+ kthread_use_mm(ctx->sqo_mm);
}
sqe_flags = READ_ONCE(sqe->flags);
id = READ_ONCE(sqe->personality);
if (id) {
+ io_req_init_async(req);
req->work.creds = idr_find(&ctx->personality_idr, id);
if (unlikely(!req->work.creds))
return -EINVAL;
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
- IOSQE_ASYNC | IOSQE_FIXED_FILE |
- IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+ req->flags |= sqe_flags;
if (!io_op_defs[req->opcode].needs_file)
return 0;
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd, bool async)
+ struct file *ring_file, int ring_fd)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
break;
}
- err = io_init_req(ctx, req, sqe, statep, async);
+ err = io_init_req(ctx, req, sqe, statep);
io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
}
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, async);
- err = io_submit_sqe(req, sqe, statep, &link);
+ true, io_async_submit(ctx));
+ err = io_submit_sqe(req, sqe, &link);
if (err)
goto fail_req;
}
struct mm_struct *mm = current->mm;
if (mm) {
- unuse_mm(mm);
+ kthread_unuse_mm(mm);
mmput(mm);
}
}
{
struct io_ring_ctx *ctx = data;
const struct cred *old_cred;
- mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned long timeout;
int ret = 0;
- complete(&ctx->completions[1]);
+ complete(&ctx->sq_thread_comp);
- old_fs = get_fs();
- set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
timeout = jiffies + ctx->sq_thread_idle;
}
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
+ if (likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
if (current->task_works)
task_work_run();
- set_fs(old_fs);
io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
struct fixed_file_data *data = ctx->file_data;
struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
- unsigned long flags;
if (!data)
return -ENXIO;
- spin_lock_irqsave(&data->lock, flags);
+ spin_lock(&data->lock);
if (!list_empty(&data->ref_list))
ref_node = list_first_entry(&data->ref_list,
struct fixed_file_ref_node, node);
- spin_unlock_irqrestore(&data->lock, flags);
+ spin_unlock(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
percpu_ref_kill(&data->refs);
/* wait for all refs nodes to complete */
+ flush_delayed_work(&ctx->file_put_work);
wait_for_completion(&data->done);
__io_sqe_files_unregister(ctx);
static void io_sq_thread_stop(struct io_ring_ctx *ctx)
{
if (ctx->sqo_thread) {
- wait_for_completion(&ctx->completions[1]);
+ wait_for_completion(&ctx->sq_thread_comp);
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity
struct file *file;
};
-static void io_file_put_work(struct work_struct *work)
+/*
+ * Release everything attached to one fixed-file ref node: put and free
+ * each io_file_put entry on its file_list, unlink the node under
+ * file_data->lock, tear down its percpu ref, free the node, and finally
+ * drop one reference on file_data->refs.
+ */
+static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
{
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *file_data;
- struct io_ring_ctx *ctx;
+ struct fixed_file_data *file_data = ref_node->file_data;
+ struct io_ring_ctx *ctx = file_data->ctx;
struct io_file_put *pfile, *tmp;
- unsigned long flags;
-
- ref_node = container_of(work, struct fixed_file_ref_node, work);
- file_data = ref_node->file_data;
- ctx = file_data->ctx;
+ /* _safe iteration: each entry is unlinked and freed inside the loop */
list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
- list_del_init(&pfile->list);
+ list_del(&pfile->list);
io_ring_file_put(ctx, pfile->file);
kfree(pfile);
}
- spin_lock_irqsave(&file_data->lock, flags);
- list_del_init(&ref_node->node);
- spin_unlock_irqrestore(&file_data->lock, flags);
+ spin_lock(&file_data->lock);
+ list_del(&ref_node->node);
+ spin_unlock(&file_data->lock);
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
percpu_ref_put(&file_data->refs);
}
+/*
+ * Work handler for ctx->file_put_work: detaches every ref node currently
+ * on ctx->file_put_llist in one step and runs __io_file_put_work() on each
+ * of them in turn.
+ */
+static void io_file_put_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+ struct llist_node *node;
+
+ ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
+ node = llist_del_all(&ctx->file_put_llist);
+
+ while (node) {
+ struct fixed_file_ref_node *ref_node;
+ /* read ->next first: __io_file_put_work() frees the ref node */
+ struct llist_node *next = node->next;
+
+ ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
+ __io_file_put_work(ref_node);
+ node = next;
+ }
+}
+
+/*
+ * Called with the percpu_ref embedded in a fixed-file ref node. Adds that
+ * node to ctx->file_put_llist and schedules ctx->file_put_work on
+ * system_wq: with no delay if file_data->refs is dying, otherwise after HZ
+ * jiffies and only when llist_add() reports this was the first addition.
+ */
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_ref_node *ref_node;
+ struct io_ring_ctx *ctx;
+ bool first_add;
+ int delay = HZ;
ref_node = container_of(ref, struct fixed_file_ref_node, refs);
+ ctx = ref_node->file_data->ctx;
+
+ if (percpu_ref_is_dying(&ctx->file_data->refs))
+ delay = 0;
- queue_work(system_wq, &ref_node->work);
+ /* llist_add() returns true if the list was empty before this add */
+ first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
+ if (!delay)
+ mod_delayed_work(system_wq, &ctx->file_put_work, 0);
+ else if (first_add)
+ queue_delayed_work(system_wq, &ctx->file_put_work, delay);
}
static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
}
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->file_list);
- INIT_WORK(&ref_node->work, io_file_put_work);
ref_node->file_data = ctx->file_data;
return ref_node;
-
}
static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
int fd, ret = 0;
unsigned i;
struct fixed_file_ref_node *ref_node;
- unsigned long flags;
if (ctx->file_data)
return -EBUSY;
}
ctx->file_data->cur_refs = &ref_node->refs;
- spin_lock_irqsave(&ctx->file_data->lock, flags);
+ spin_lock(&ctx->file_data->lock);
list_add(&ref_node->node, &ctx->file_data->ref_list);
- spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ spin_unlock(&ctx->file_data->lock);
percpu_ref_get(&ctx->file_data->refs);
return ret;
}
__s32 __user *fds;
int fd, i, err;
__u32 done;
- unsigned long flags;
bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
if (needs_switch) {
percpu_ref_kill(data->cur_refs);
- spin_lock_irqsave(&data->lock, flags);
+ spin_lock(&data->lock);
list_add(&ref_node->node, &data->ref_list);
data->cur_refs = &ref_node->refs;
- spin_unlock_irqrestore(&data->lock, flags);
+ spin_unlock(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
data.user = ctx->user;
data.free_work = io_free_work;
+ data.do_work = io_wq_submit_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
ret = 0;
if (!pages || nr_pages > got_pages) {
- kfree(vmas);
- kfree(pages);
+ kvfree(vmas);
+ kvfree(pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
vmas = kvmalloc_array(nr_pages,
}
ret = 0;
- down_read(¤t->mm->mmap_sem);
+ mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
} else {
ret = pret < 0 ? pret : -EFAULT;
}
- up_read(¤t->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (ret) {
/*
* if we did partial map, or found file backed vmas,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
- kfree(ctx->completions);
kfree(ctx->cancel_hash);
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
- wait_for_completion(&ctx->completions[0]);
+ wait_for_completion(&ctx->ref_comp);
io_ring_ctx_free(ctx);
}
percpu_ref_kill(&ctx->refs);
mutex_unlock(&ctx->uring_lock);
- /*
- * Wait for sq thread to idle, if we have one. It won't spin on new
- * work after we've killed the ctx ref above. This is important to do
- * before we cancel existing commands, as the thread could otherwise
- * be queueing new work post that. If that's work we need to cancel,
- * it could cause shutdown to hang.
- */
- while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
- cond_resched();
-
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
* all we had, then we're done with this request.
*/
if (refcount_sub_and_test(2, &cancel_req->refs)) {
- io_put_req(cancel_req);
+ io_free_req(cancel_req);
finish_wait(&ctx->inflight_wait, &wait);
continue;
}
+ } else {
+ io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ io_put_req(cancel_req);
}
- io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
- io_put_req(cancel_req);
schedule();
finish_wait(&ctx->inflight_wait, &wait);
}
submitted = to_submit;
} else if (to_submit) {
mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)
p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
* after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- ret = wait_for_completion_interruptible(&ctx->completions[0]);
+ ret = wait_for_completion_interruptible(&ctx->ref_comp);
mutex_lock(&ctx->uring_lock);
if (ret) {
percpu_ref_resurrect(&ctx->refs);
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
out:
- reinit_completion(&ctx->completions[0]);
+ reinit_completion(&ctx->ref_comp);
}
return ret;
}