/*
 * Bit numbers for io_wq->state, used with set_bit()/test_bit().
 * This change removes IO_WQ_BIT_ERROR, leaving only the exit bit.
 */
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
- IO_WQ_BIT_ERROR = 1, /* error on setup */
};
enum {
const struct cred *cur_creds;
const struct cred *saved_creds;
+ struct completion ref_done;
+ struct completion started;
+
struct rcu_head rcu;
};
struct {
raw_spinlock_t lock;
struct io_wq_work_list work_list;
- unsigned long hash_map;
unsigned flags;
} ____cacheline_aligned_in_smp;
struct hlist_nulls_head free_list;
struct list_head all_list;
+ struct wait_queue_entry wait;
+
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
struct task_struct *manager;
struct user_struct *user;
+
+ struct io_wq_hash *hash;
+
refcount_t refs;
- struct completion done;
+ struct completion started;
+ struct completion exited;
+
+ atomic_t worker_refs;
+ struct completion worker_done;
struct hlist_node cpuhp_node;
/*
 * Drop one reference on worker->ref.
 *
 * When the last reference goes away, signal worker->ref_done. The removed
 * line woke worker->task directly instead of using a completion.
 */
static void io_worker_release(struct io_worker *worker)
{
if (refcount_dec_and_test(&worker->ref))
- wake_up_process(worker->task);
+ complete(&worker->ref_done);
}
static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
unsigned flags;
- /*
- * If we're not at zero, someone else is holding a brief reference
- * to the worker. Wait for that to go away.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- if (!refcount_dec_and_test(&worker->ref))
- schedule();
- __set_current_state(TASK_RUNNING);
+ if (refcount_dec_and_test(&worker->ref))
+ complete(&worker->ref_done);
+ wait_for_completion(&worker->ref_done);
preempt_disable();
current->flags &= ~PF_IO_WORKER;
raw_spin_unlock_irq(&wqe->lock);
kfree_rcu(worker, rcu);
- if (refcount_dec_and_test(&wqe->wq->refs))
- complete(&wqe->wq->done);
+ if (atomic_dec_and_test(&wqe->wq->worker_refs))
+ complete(&wqe->wq->worker_done);
}
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
{
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
io_wqe_inc_running(worker);
+ complete(&worker->started);
}
/*
return work->flags >> IO_WQ_HASH_SHIFT;
}
/*
 * Queue wqe->wait on the shared hash waitqueue (wq->hash->wait).
 *
 * @wqe:  io_wqe whose wait entry is queued
 * @hash: bit number in wq->hash->map that the caller found already set
 *
 * Does nothing if wqe->wait is already queued. After queueing, the bit is
 * tested again while still holding wait.lock; if it is no longer set, the
 * entry is removed again and the task state is put back to TASK_RUNNING.
 * Presumably the re-test closes the window in which the bit was cleared
 * and the queue woken before this entry was added - TODO confirm.
 */
+static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
+{
+ struct io_wq *wq = wqe->wq;
+
+ spin_lock(&wq->hash->wait.lock);
+ if (list_empty(&wqe->wait.entry)) { /* not queued yet */
+ __add_wait_queue(&wq->hash->wait, &wqe->wait);
+ if (!test_bit(hash, &wq->hash->map)) { /* bit already clear: no need to wait */
+ __set_current_state(TASK_RUNNING);
+ list_del_init(&wqe->wait.entry);
+ }
+ }
+ spin_unlock(&wq->hash->wait.lock);
+}
+
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
- unsigned int hash;
+ unsigned int stall_hash = -1U;
wq_list_for_each(node, prev, &wqe->work_list) {
+ unsigned int hash;
+
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
return work;
}
- /* hashed, can run if not already running */
hash = io_get_work_hash(work);
- if (!(wqe->hash_map & BIT(hash))) {
- wqe->hash_map |= BIT(hash);
- /* all items with this hash lie in [work, tail] */
- tail = wqe->hash_tail[hash];
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+
+ /* hashed, can run if not already running */
+ if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
wqe->hash_tail[hash] = NULL;
wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
+ if (stall_hash == -1U)
+ stall_hash = hash;
+ /* fast forward to a next hash, for-each will fix up @prev */
+ node = &tail->list;
+ }
+
+ if (stall_hash != -1U) {
+ raw_spin_unlock(&wqe->lock);
+ io_wait_on_hash(wqe, stall_hash);
+ raw_spin_lock(&wqe->lock);
}
return NULL;
if (!work)
break;
io_assign_current_work(worker, work);
+ __set_current_state(TASK_RUNNING);
/* handle a whole dependent link */
do {
io_wqe_enqueue(wqe, linked);
if (hash != -1U && !next_hashed) {
+ clear_bit(hash, &wq->hash->map);
+ if (wq_has_sleeper(&wq->hash->wait))
+ wake_up(&wq->hash->wait);
raw_spin_lock_irq(&wqe->lock);
- wqe->hash_map &= ~BIT_ULL(hash);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
/* skip unnecessary unlock-lock wqe->lock */
if (!work)
loop:
raw_spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
- __set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
goto loop;
}
struct io_worker *worker;
pid_t pid;
+ __set_current_state(TASK_RUNNING);
+
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
if (!worker)
return false;
worker->nulls_node.pprev = NULL;
worker->wqe = wqe;
spin_lock_init(&worker->lock);
+ init_completion(&worker->ref_done);
+ init_completion(&worker->started);
+
+ atomic_inc(&wq->worker_refs);
if (index == IO_WQ_ACCT_BOUND)
pid = io_wq_fork_thread(task_thread_bound, worker);
else
pid = io_wq_fork_thread(task_thread_unbound, worker);
if (pid < 0) {
+ if (atomic_dec_and_test(&wq->worker_refs))
+ complete(&wq->worker_done);
kfree(worker);
return false;
}
- refcount_inc(&wq->refs);
+ wait_for_completion(&worker->started);
return true;
}
{
struct io_wqe_acct *acct = &wqe->acct[index];
+ if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state))
+ return false;
/* if we have available workers or no work, no need */
if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
return false;
return false;
}
/*
 * Walk every online node of @wq and create bound/unbound workers where
 * io_wqe_need_worker() says one is needed.
 *
 * The decision is taken under wqe->lock, but the workers are created only
 * after the lock has been dropped: create_io_worker() allocates with
 * GFP_KERNEL and waits on a completion, neither of which may happen with
 * the raw spinlock held. This is the loop body that previously lived
 * inline in the manager thread.
 */
+static void io_wq_check_workers(struct io_wq *wq)
+{
+ int node;
+
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+ bool fork_worker[2] = { false, false }; /* indexed by IO_WQ_ACCT_* */
+
+ if (!node_online(node))
+ continue;
+
+ raw_spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+ raw_spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
+ }
+}
+
/*
* Manager thread. Tasked with creating new workers, if we need them.
*/
current->flags |= PF_IO_WORKER;
wq->manager = current;
- complete(&wq->done);
+ complete(&wq->started);
- while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- bool fork_worker[2] = { false, false };
-
- if (!node_online(node))
- continue;
-
- raw_spin_lock_irq(&wqe->lock);
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
- fork_worker[IO_WQ_ACCT_BOUND] = true;
- if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
- fork_worker[IO_WQ_ACCT_UNBOUND] = true;
- raw_spin_unlock_irq(&wqe->lock);
- if (fork_worker[IO_WQ_ACCT_BOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
- if (fork_worker[IO_WQ_ACCT_UNBOUND])
- create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
- }
+ do {
set_current_state(TASK_INTERRUPTIBLE);
+ io_wq_check_workers(wq);
schedule_timeout(HZ);
if (fatal_signal_pending(current))
set_bit(IO_WQ_BIT_EXIT, &wq->state);
- }
+ } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
- if (refcount_dec_and_test(&wq->refs)) {
- complete(&wq->done);
- do_exit(0);
- }
- /* if ERROR is set and we get here, we have workers to wake */
- if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
- }
+ io_wq_check_workers(wq);
+
+ rcu_read_lock();
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
+ rcu_read_unlock();
+
+ /* we might not ever have created any workers */
+ if (atomic_read(&wq->worker_refs))
+ wait_for_completion(&wq->worker_done);
+ wq->manager = NULL;
+ complete(&wq->exited);
+ io_wq_put(wq);
do_exit(0);
}
wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}
/*
 * Start the manager thread for @wq unless wq->manager is already set.
 *
 * Before forking, wq->worker_done is re-initialised, IO_WQ_BIT_EXIT is
 * cleared and a reference on wq->refs is taken for the manager. On success
 * the caller blocks until the manager signals wq->started. On failure the
 * reference is dropped again with io_wq_put().
 *
 * Returns 0 if a manager already exists or was started, otherwise the
 * negative value returned by io_wq_fork_thread().
 */
+static int io_wq_fork_manager(struct io_wq *wq)
+{
+ int ret;
+
+ if (wq->manager)
+ return 0;
+
+ reinit_completion(&wq->worker_done);
+ clear_bit(IO_WQ_BIT_EXIT, &wq->state);
+ refcount_inc(&wq->refs);
+ current->flags |= PF_IO_WORKER; /* set only for the duration of the fork; reason not visible here */
+ ret = io_wq_fork_thread(io_wq_manager, wq);
+ current->flags &= ~PF_IO_WORKER;
+ if (ret >= 0) {
+ wait_for_completion(&wq->started);
+ return 0;
+ }
+
+ io_wq_put(wq); /* fork failed: drop the reference taken above */
+ return ret;
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
+ /* Can only happen if manager creation fails after exec */
+ if (unlikely(io_wq_fork_manager(wqe->wq))) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ wqe->wq->do_work(work);
+ return;
+ }
+
work_flags = work->flags;
raw_spin_lock_irqsave(&wqe->lock, flags);
io_wqe_insert_work(wqe, work);
return IO_WQ_CANCEL_NOTFOUND;
}
/*
 * Wait-queue callback installed as wqe->wait.func in io_wq_create().
 *
 * Unlinks the wait entry, then tries to activate a free worker of the
 * owning io_wqe under rcu_read_lock(). If io_wqe_activate_free_worker()
 * returns zero, the manager task is woken instead. Always returns 1.
 *
 * NOTE(review): the manager thread sets wq->manager to NULL when it exits,
 * and wake_up_process() is called here without a NULL check - confirm this
 * callback cannot run after the manager has exited.
 */
+static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
+ int ret;
+
+ list_del_init(&wait->entry);
+
+ rcu_read_lock();
+ ret = io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
+ if (!ret)
+ wake_up_process(wqe->wq->manager);
+
+ return 1;
+}
+
/*
 * Set up a new io_wq from @data and start its manager thread.
 *
 * Takes a reference on data->hash and stores it in wq->hash. Returns the
 * new wq on success, or ERR_PTR() of the error code on failure.
 */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
if (ret)
goto err_wqes;
+ refcount_inc(&data->hash->refs);
+ wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC);
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->wait.func = io_wqe_hash_wake;
+ INIT_LIST_HEAD(&wqe->wait.entry);
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
}
wq->task_pid = current->pid;
- init_completion(&wq->done);
+ init_completion(&wq->started);
+ init_completion(&wq->exited);
refcount_set(&wq->refs, 1);
- current->flags |= PF_IO_WORKER;
- ret = io_wq_fork_thread(io_wq_manager, wq);
- current->flags &= ~PF_IO_WORKER;
- if (ret >= 0) {
- wait_for_completion(&wq->done);
- reinit_completion(&wq->done);
+ init_completion(&wq->worker_done);
+ atomic_set(&wq->worker_refs, 0);
+
+ ret = io_wq_fork_manager(wq);
+ if (!ret)
return wq;
- }
- if (refcount_dec_and_test(&wq->refs))
- complete(&wq->done);
/*
 * NOTE(review): wq->refs is 1 at this point (set to 1 above, and
 * io_wq_fork_manager() drops the reference it took when it fails), so
 * this io_wq_put() runs io_wq_destroy(), which already puts wq->hash and
 * frees wq. The io_wq_put_hash() below and the wq->cpuhp_node access
 * after "err:" then look like a double put / use-after-free - confirm
 * and keep only one of the two teardown paths.
 */
+ io_wq_put(wq);
+ io_wq_put_hash(data->hash);
err:
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
for_each_node(node)
return ERR_PTR(ret);
}
/*
 * Tear down @wq. With this change the function is static and is reached
 * through io_wq_put() once the last reference on wq->refs is dropped.
 *
 * Sequence: remove the cpuhp instance, set IO_WQ_BIT_EXIT, and if a manager
 * exists wake it and wait for wq->exited. Then, under the shared hash
 * wait.lock, unlink every node's wait entry and free the io_wqe. Finally
 * drop the hash reference and free the wqes array and the wq itself.
 *
 * The removed lines woke the workers and waited on wq->done here; after
 * this change the manager thread wakes the workers and waits for them.
 */
-void io_wq_destroy(struct io_wq *wq)
+static void io_wq_destroy(struct io_wq *wq)
{
int node;
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
set_bit(IO_WQ_BIT_EXIT, &wq->state);
- if (wq->manager)
+ if (wq->manager) {
wake_up_process(wq->manager);
+ wait_for_completion(&wq->exited);
+ }
- rcu_read_lock();
- for_each_node(node)
- io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
- rcu_read_unlock();
-
- wait_for_completion(&wq->done);
+ spin_lock_irq(&wq->hash->wait.lock);
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
- for_each_node(node)
- kfree(wq->wqes[node]);
+ list_del_init(&wqe->wait.entry); /* unlink from the hash waitqueue before freeing */
+ kfree(wqe);
+ }
+ spin_unlock_irq(&wq->hash->wait.lock);
+ io_wq_put_hash(wq->hash);
kfree(wq->wqes);
kfree(wq);
+
+}
+
/*
 * Drop one reference on wq->refs. The final put destroys @wq through
 * io_wq_destroy(), so the caller must not touch @wq afterwards.
 */
+void io_wq_put(struct io_wq *wq)
+{
+ if (refcount_dec_and_test(&wq->refs))
+ io_wq_destroy(wq);
}
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)