fs/io-wq.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Basic worker thread pool for io_uring
   4  *
   5  * Copyright (C) 2019 Jens Axboe
   6  *
   7  */
   8 #include <linux/kernel.h>
   9 #include <linux/init.h>
  10 #include <linux/errno.h>
  11 #include <linux/sched/signal.h>
  12 #include <linux/mm.h>
  13 #include <linux/sched/mm.h>
  14 #include <linux/percpu.h>
  15 #include <linux/slab.h>
  16 #include <linux/kthread.h>
  17 #include <linux/rculist_nulls.h>
  18 #include <linux/fs_struct.h>
  19 #include <linux/blk-cgroup.h>
  20 #include <linux/audit.h>
  21 #include <linux/cpu.h>
  22
  23 #include "../kernel/sched/sched.h"
  24 #include "io-wq.h"
  25
  26 #define WORKER_IDLE_TIMEOUT     (5 * HZ)
  27
  28 enum {
  29         IO_WORKER_F_UP          = 1,    /* up and active */
  30         IO_WORKER_F_RUNNING     = 2,    /* account as running */
  31         IO_WORKER_F_FREE        = 4,    /* worker on free list */
  32         IO_WORKER_F_FIXED       = 8,    /* static idle worker */
  33         IO_WORKER_F_BOUND       = 16,   /* is doing bounded work */
  34 };
  35
  36 enum {
  37         IO_WQ_BIT_EXIT          = 0,    /* wq exiting */
  38         IO_WQ_BIT_ERROR         = 1,    /* error on setup */
  39 };
  40
  41 enum {
  42         IO_WQE_FLAG_STALLED     = 1,    /* stalled on hash */
  43 };
  44
  45 /*
  46  * One for each thread in a wqe pool
  47  */
  48 struct io_worker {
  49         refcount_t ref;
  50         unsigned flags;
  51         struct hlist_nulls_node nulls_node;
  52         struct list_head all_list;
  53         struct task_struct *task;
  54         struct io_wqe *wqe;
  55
  56         struct io_wq_work *cur_work;
  57         spinlock_t lock;
  58
  59         struct rcu_head rcu;
  60         struct mm_struct *mm;
  61 #ifdef CONFIG_BLK_CGROUP
  62         struct cgroup_subsys_state *blkcg_css;
  63 #endif
  64         const struct cred *cur_creds;
  65         const struct cred *saved_creds;
  66         struct nsproxy *restore_nsproxy;
  67 };
  68
  69 #if BITS_PER_LONG == 64
  70 #define IO_WQ_HASH_ORDER        6
  71 #else
  72 #define IO_WQ_HASH_ORDER        5
  73 #endif
  74
  75 #define IO_WQ_NR_HASH_BUCKETS   (1u << IO_WQ_HASH_ORDER)
  76
  77 struct io_wqe_acct {
  78         unsigned nr_workers;
  79         unsigned max_workers;
  80         atomic_t nr_running;
  81 };
  82
  83 enum {
  84         IO_WQ_ACCT_BOUND,
  85         IO_WQ_ACCT_UNBOUND,
  86 };
  87
  88 /*
  89  * Per-node worker thread pool
  90  */
  91 struct io_wqe {
  92         struct {
  93                 raw_spinlock_t lock;
  94                 struct io_wq_work_list work_list;
  95                 unsigned long hash_map;
  96                 unsigned flags;
  97         } ____cacheline_aligned_in_smp;
  98
  99         int node;
 100         struct io_wqe_acct acct[2];
 101
 102         struct hlist_nulls_head free_list;
 103         struct list_head all_list;
 104
 105         struct io_wq *wq;
 106         struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
 107 };
 108
 109 /*
 110  * Per io_wq state
 111   */
 112 struct io_wq {
 113         struct io_wqe **wqes;
 114         unsigned long state;
 115
 116         free_work_fn *free_work;
 117         io_wq_work_fn *do_work;
 118
 119         struct task_struct *manager;
 120         struct user_struct *user;
 121         refcount_t refs;
 122         struct completion done;
 123
 124         struct hlist_node cpuhp_node;
 125 };
 126
 127 static enum cpuhp_state io_wq_online;
 128
 129 static bool io_worker_get(struct io_worker *worker)
 130 {
 131         return refcount_inc_not_zero(&worker->ref);
 132 }
 133
 134 static void io_worker_release(struct io_worker *worker)
 135 {
 136         if (refcount_dec_and_test(&worker->ref))
 137                 wake_up_process(worker->task);
 138 }
 139
 140 /*
 141  * Note: drops the wqe->lock if returning true! The caller must re-acquire
 142  * the lock in that case. Some callers need to restart handling if this
 143  * happens, so we can't just re-acquire the lock on behalf of the caller.
 144  */
 145 static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 146 {
 147         bool dropped_lock = false;
 148
 149         if (worker->saved_creds) {
 150                 revert_creds(worker->saved_creds);
 151                 worker->cur_creds = worker->saved_creds = NULL;
 152         }
 153
 154         if (current->files) {
 155                 __acquire(&wqe->lock);
 156                 raw_spin_unlock_irq(&wqe->lock);
 157                 dropped_lock = true;
 158
 159                 task_lock(current);
 160                 current->files = NULL;
 161                 current->nsproxy = worker->restore_nsproxy;
 162                 task_unlock(current);
 163         }
 164
 165         if (current->fs)
 166                 current->fs = NULL;
 167
 168         /*
 169          * If we have an active mm, we need to drop the wq lock before unusing
 170          * it. If we do, return true and let the caller retry the idle loop.
 171          */
 172         if (worker->mm) {
 173                 if (!dropped_lock) {
 174                         __acquire(&wqe->lock);
 175                         raw_spin_unlock_irq(&wqe->lock);
 176                         dropped_lock = true;
 177                 }
 178                 __set_current_state(TASK_RUNNING);
 179                 kthread_unuse_mm(worker->mm);
 180                 mmput(worker->mm);
 181                 worker->mm = NULL;
 182         }
 183
 184 #ifdef CONFIG_BLK_CGROUP
 185         if (worker->blkcg_css) {
 186                 kthread_associate_blkcg(NULL);
 187                 worker->blkcg_css = NULL;
 188         }
 189 #endif
 190         if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
 191                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 192         return dropped_lock;
 193 }
 194
 195 static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
 196                                                    struct io_wq_work *work)
 197 {
 198         if (work->flags & IO_WQ_WORK_UNBOUND)
 199                 return &wqe->acct[IO_WQ_ACCT_UNBOUND];
 200
 201         return &wqe->acct[IO_WQ_ACCT_BOUND];
 202 }
 203
 204 static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
 205 {
 206         struct io_wqe *wqe = worker->wqe;
 207
 208         if (worker->flags & IO_WORKER_F_BOUND)
 209                 return &wqe->acct[IO_WQ_ACCT_BOUND];
 210
 211         return &wqe->acct[IO_WQ_ACCT_UNBOUND];
 212 }
 213
 214 static void io_worker_exit(struct io_worker *worker)
 215 {
 216         struct io_wqe *wqe = worker->wqe;
 217         struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 218
 219         /*
 220          * If we're not at zero, someone else is holding a brief reference
 221          * to the worker. Wait for that to go away.
 222          */
 223         set_current_state(TASK_INTERRUPTIBLE);
 224         if (!refcount_dec_and_test(&worker->ref))
 225                 schedule();
 226         __set_current_state(TASK_RUNNING);
 227
 228         preempt_disable();
 229         current->flags &= ~PF_IO_WORKER;
 230         if (worker->flags & IO_WORKER_F_RUNNING)
 231                 atomic_dec(&acct->nr_running);
 232         if (!(worker->flags & IO_WORKER_F_BOUND))
 233                 atomic_dec(&wqe->wq->user->processes);
 234         worker->flags = 0;
 235         preempt_enable();
 236
 237         raw_spin_lock_irq(&wqe->lock);
 238         hlist_nulls_del_rcu(&worker->nulls_node);
 239         list_del_rcu(&worker->all_list);
 240         if (__io_worker_unuse(wqe, worker)) {
 241                 __release(&wqe->lock);
 242                 raw_spin_lock_irq(&wqe->lock);
 243         }
 244         acct->nr_workers--;
 245         raw_spin_unlock_irq(&wqe->lock);
 246
 247         kfree_rcu(worker, rcu);
 248         if (refcount_dec_and_test(&wqe->wq->refs))
 249                 complete(&wqe->wq->done);
 250 }
 251
 252 static inline bool io_wqe_run_queue(struct io_wqe *wqe)
 253         __must_hold(wqe->lock)
 254 {
 255         if (!wq_list_empty(&wqe->work_list) &&
 256             !(wqe->flags & IO_WQE_FLAG_STALLED))
 257                 return true;
 258         return false;
 259 }
 260
 261 /*
 262  * Check head of free list for an available worker. If one isn't available,
 263  * caller must wake up the wq manager to create one.
 264  */
 265 static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 266         __must_hold(RCU)
 267 {
 268         struct hlist_nulls_node *n;
 269         struct io_worker *worker;
 270
 271         n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
 272         if (is_a_nulls(n))
 273                 return false;
 274
 275         worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
 276         if (io_worker_get(worker)) {
 277                 wake_up_process(worker->task);
 278                 io_worker_release(worker);
 279                 return true;
 280         }
 281
 282         return false;
 283 }
 284
 285 /*
 286  * We need a worker. If we find a free one, we're good. If not, and we're
 287  * below the max number of workers, wake up the manager to create one.
 288  */
 289 static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 290 {
 291         bool ret;
 292
 293         /*
 294          * Most likely an attempt to queue unbounded work on an io_wq that
 295          * wasn't setup with any unbounded workers.
 296          */
 297         WARN_ON_ONCE(!acct->max_workers);
 298
 299         rcu_read_lock();
 300         ret = io_wqe_activate_free_worker(wqe);
 301         rcu_read_unlock();
 302
 303         if (!ret && acct->nr_workers < acct->max_workers)
 304                 wake_up_process(wqe->wq->manager);
 305 }
 306
 307 static void io_wqe_inc_running(struct io_worker *worker)
 308 {
 309         struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 310
 311         atomic_inc(&acct->nr_running);
 312 }
 313
 314 static void io_wqe_dec_running(struct io_worker *worker)
 315         __must_hold(wqe->lock)
 316 {
 317         struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 318         struct io_wqe *wqe = worker->wqe;
 319
 320         if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
 321                 io_wqe_wake_worker(wqe, acct);
 322 }
 323
 324 static void io_worker_start(struct io_worker *worker)
 325 {
 326         allow_kernel_signal(SIGINT);
 327
 328         current->flags |= PF_IO_WORKER;
 329         current->fs = NULL;
 330         current->files = NULL;
 331
 332         worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
 333         worker->restore_nsproxy = current->nsproxy;
 334         io_wqe_inc_running(worker);
 335 }
 336
 337 /*
 338  * Worker will start processing some work. Move it to the busy list, if
 339  * it's currently on the freelist
 340  */
 341 static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 342                              struct io_wq_work *work)
 343         __must_hold(wqe->lock)
 344 {
 345         bool worker_bound, work_bound;
 346
 347         if (worker->flags & IO_WORKER_F_FREE) {
 348                 worker->flags &= ~IO_WORKER_F_FREE;
 349                 hlist_nulls_del_init_rcu(&worker->nulls_node);
 350         }
 351
 352         /*
 353          * If worker is moving from bound to unbound (or vice versa), then
 354          * ensure we update the running accounting.
 355          */
 356         worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
 357         work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
 358         if (worker_bound != work_bound) {
 359                 io_wqe_dec_running(worker);
 360                 if (work_bound) {
 361                         worker->flags |= IO_WORKER_F_BOUND;
 362                         wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
 363                         wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
 364                         atomic_dec(&wqe->wq->user->processes);
 365                 } else {
 366                         worker->flags &= ~IO_WORKER_F_BOUND;
 367                         wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
 368                         wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
 369                         atomic_inc(&wqe->wq->user->processes);
 370                 }
 371                 io_wqe_inc_running(worker);
 372          }
 373 }
 374
 375 /*
 376  * No work, worker going to sleep. Move to freelist, and unuse mm if we
 377  * have one attached. Dropping the mm may potentially sleep, so we drop
 378  * the lock in that case and return success. Since the caller has to
 379  * retry the loop in that case (we changed task state), we don't regrab
 380  * the lock if we return success.
 381  */
 382 static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 383         __must_hold(wqe->lock)
 384 {
 385         if (!(worker->flags & IO_WORKER_F_FREE)) {
 386                 worker->flags |= IO_WORKER_F_FREE;
 387                 hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 388         }
 389
 390         return __io_worker_unuse(wqe, worker);
 391 }
 392
 393 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 394 {
 395         return work->flags >> IO_WQ_HASH_SHIFT;
 396 }
 397
 398 static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 399         __must_hold(wqe->lock)
 400 {
 401         struct io_wq_work_node *node, *prev;
 402         struct io_wq_work *work, *tail;
 403         unsigned int hash;
 404
 405         wq_list_for_each(node, prev, &wqe->work_list) {
 406                 work = container_of(node, struct io_wq_work, list);
 407
 408                 /* not hashed, can run anytime */
 409                 if (!io_wq_is_hashed(work)) {
 410                         wq_list_del(&wqe->work_list, node, prev);
 411                         return work;
 412                 }
 413
 414                 /* hashed, can run if not already running */
 415                 hash = io_get_work_hash(work);
 416                 if (!(wqe->hash_map & BIT(hash))) {
 417                         wqe->hash_map |= BIT(hash);
 418                         /* all items with this hash lie in [work, tail] */
 419                         tail = wqe->hash_tail[hash];
 420                         wqe->hash_tail[hash] = NULL;
 421                         wq_list_cut(&wqe->work_list, &tail->list, prev);
 422                         return work;
 423                 }
 424         }
 425
 426         return NULL;
 427 }
 428
 429 static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
 430 {
 431         if (worker->mm) {
 432                 kthread_unuse_mm(worker->mm);
 433                 mmput(worker->mm);
 434                 worker->mm = NULL;
 435         }
 436
 437         if (mmget_not_zero(work->identity->mm)) {
 438                 kthread_use_mm(work->identity->mm);
 439                 worker->mm = work->identity->mm;
 440                 return;
 441         }
 442
 443         /* failed grabbing mm, ensure work gets cancelled */
 444         work->flags |= IO_WQ_WORK_CANCEL;
 445 }
 446
 447 static inline void io_wq_switch_blkcg(struct io_worker *worker,
 448                                       struct io_wq_work *work)
 449 {
 450 #ifdef CONFIG_BLK_CGROUP
 451         if (!(work->flags & IO_WQ_WORK_BLKCG))
 452                 return;
 453         if (work->identity->blkcg_css != worker->blkcg_css) {
 454                 kthread_associate_blkcg(work->identity->blkcg_css);
 455                 worker->blkcg_css = work->identity->blkcg_css;
 456         }
 457 #endif
 458 }
 459
 460 static void io_wq_switch_creds(struct io_worker *worker,
 461                                struct io_wq_work *work)
 462 {
 463         const struct cred *old_creds = override_creds(work->identity->creds);
 464
 465         worker->cur_creds = work->identity->creds;
 466         if (worker->saved_creds)
 467                 put_cred(old_creds); /* creds set by previous switch */
 468         else
 469                 worker->saved_creds = old_creds;
 470 }
 471
 472 static void io_impersonate_work(struct io_worker *worker,
 473                                 struct io_wq_work *work)
 474 {
 475         if ((work->flags & IO_WQ_WORK_FILES) &&
 476             current->files != work->identity->files) {
 477                 task_lock(current);
 478                 current->files = work->identity->files;
 479                 current->nsproxy = work->identity->nsproxy;
 480                 task_unlock(current);
 481                 if (!work->identity->files) {
 482                         /* failed grabbing files, ensure work gets cancelled */
 483                         work->flags |= IO_WQ_WORK_CANCEL;
 484                 }
 485         }
 486         if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
 487                 current->fs = work->identity->fs;
 488         if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
 489                 io_wq_switch_mm(worker, work);
 490         if ((work->flags & IO_WQ_WORK_CREDS) &&
 491             worker->cur_creds != work->identity->creds)
 492                 io_wq_switch_creds(worker, work);
 493         if (work->flags & IO_WQ_WORK_FSIZE)
 494                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
 495         else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
 496                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 497         io_wq_switch_blkcg(worker, work);
 498 #ifdef CONFIG_AUDIT
 499         current->loginuid = work->identity->loginuid;
 500         current->sessionid = work->identity->sessionid;
 501 #endif
 502 }
 503
 504 static void io_assign_current_work(struct io_worker *worker,
 505                                    struct io_wq_work *work)
 506 {
 507         if (work) {
 508                 /* flush pending signals before assigning new work */
 509                 if (signal_pending(current))
 510                         flush_signals(current);
 511                 cond_resched();
 512         }
 513
 514 #ifdef CONFIG_AUDIT
 515         current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
 516         current->sessionid = AUDIT_SID_UNSET;
 517 #endif
 518
 519         spin_lock_irq(&worker->lock);
 520         worker->cur_work = work;
 521         spin_unlock_irq(&worker->lock);
 522 }
 523
 524 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 525
 526 static void io_worker_handle_work(struct io_worker *worker)
 527         __releases(wqe->lock)
 528 {
 529         struct io_wqe *wqe = worker->wqe;
 530         struct io_wq *wq = wqe->wq;
 531
 532         do {
 533                 struct io_wq_work *work;
 534 get_next:
 535                 /*
 536                  * If we got some work, mark us as busy. If we didn't, but
 537                  * the list isn't empty, it means we stalled on hashed work.
 538                  * Mark us stalled so we don't keep looking for work when we
 539                  * can't make progress, any work completion or insertion will
 540                  * clear the stalled flag.
 541                  */
 542                 work = io_get_next_work(wqe);
 543                 if (work)
 544                         __io_worker_busy(wqe, worker, work);
 545                 else if (!wq_list_empty(&wqe->work_list))
 546                         wqe->flags |= IO_WQE_FLAG_STALLED;
 547
 548                 raw_spin_unlock_irq(&wqe->lock);
 549                 if (!work)
 550                         break;
 551                 io_assign_current_work(worker, work);
 552
 553                 /* handle a whole dependent link */
 554                 do {
 555                         struct io_wq_work *next_hashed, *linked;
 556                         unsigned int hash = io_get_work_hash(work);
 557
 558                         next_hashed = wq_next_work(work);
 559                         io_impersonate_work(worker, work);
 560                         wq->do_work(work);
 561                         io_assign_current_work(worker, NULL);
 562
 563                         linked = wq->free_work(work);
 564                         work = next_hashed;
 565                         if (!work && linked && !io_wq_is_hashed(linked)) {
 566                                 work = linked;
 567                                 linked = NULL;
 568                         }
 569                         io_assign_current_work(worker, work);
 570                         if (linked)
 571                                 io_wqe_enqueue(wqe, linked);
 572
 573                         if (hash != -1U && !next_hashed) {
 574                                 raw_spin_lock_irq(&wqe->lock);
 575                                 wqe->hash_map &= ~BIT_ULL(hash);
 576                                 wqe->flags &= ~IO_WQE_FLAG_STALLED;
 577                                 /* skip unnecessary unlock-lock wqe->lock */
 578                                 if (!work)
 579                                         goto get_next;
 580                                 raw_spin_unlock_irq(&wqe->lock);
 581                         }
 582                 } while (work);
 583
 584                 raw_spin_lock_irq(&wqe->lock);
 585         } while (1);
 586 }
 587
 588 static int io_wqe_worker(void *data)
 589 {
 590         struct io_worker *worker = data;
 591         struct io_wqe *wqe = worker->wqe;
 592         struct io_wq *wq = wqe->wq;
 593
 594         io_worker_start(worker);
 595
 596         while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 597                 set_current_state(TASK_INTERRUPTIBLE);
 598 loop:
 599                 raw_spin_lock_irq(&wqe->lock);
 600                 if (io_wqe_run_queue(wqe)) {
 601                         __set_current_state(TASK_RUNNING);
 602                         io_worker_handle_work(worker);
 603                         goto loop;
 604                 }
 605                 /* drops the lock on success, retry */
 606                 if (__io_worker_idle(wqe, worker)) {
 607                         __release(&wqe->lock);
 608                         goto loop;
 609                 }
 610                 raw_spin_unlock_irq(&wqe->lock);
 611                 if (signal_pending(current))
 612                         flush_signals(current);
 613                 if (schedule_timeout(WORKER_IDLE_TIMEOUT))
 614                         continue;
 615                 /* timed out, exit unless we're the fixed worker */
 616                 if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
 617                     !(worker->flags & IO_WORKER_F_FIXED))
 618                         break;
 619         }
 620
 621         if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 622                 raw_spin_lock_irq(&wqe->lock);
 623                 if (!wq_list_empty(&wqe->work_list))
 624                         io_worker_handle_work(worker);
 625                 else
 626                         raw_spin_unlock_irq(&wqe->lock);
 627         }
 628
 629         io_worker_exit(worker);
 630         return 0;
 631 }
 632
 633 /*
 634  * Called when a worker is scheduled in. Mark us as currently running.
 635  */
 636 void io_wq_worker_running(struct task_struct *tsk)
 637 {
 638         struct io_worker *worker = kthread_data(tsk);
 639
 640         if (!(worker->flags & IO_WORKER_F_UP))
 641                 return;
 642         if (worker->flags & IO_WORKER_F_RUNNING)
 643                 return;
 644         worker->flags |= IO_WORKER_F_RUNNING;
 645         io_wqe_inc_running(worker);
 646 }
 647
 648 /*
 649  * Called when worker is going to sleep. If there are no workers currently
 650  * running and we have work pending, wake up a free one or have the manager
 651  * set one up.
 652  */
 653 void io_wq_worker_sleeping(struct task_struct *tsk)
 654 {
 655         struct io_worker *worker = kthread_data(tsk);
 656         struct io_wqe *wqe = worker->wqe;
 657
 658         if (!(worker->flags & IO_WORKER_F_UP))
 659                 return;
 660         if (!(worker->flags & IO_WORKER_F_RUNNING))
 661                 return;
 662
 663         worker->flags &= ~IO_WORKER_F_RUNNING;
 664
 665         raw_spin_lock_irq(&wqe->lock);
 666         io_wqe_dec_running(worker);
 667         raw_spin_unlock_irq(&wqe->lock);
 668 }
 669
 670 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 671 {
 672         struct io_wqe_acct *acct = &wqe->acct[index];
 673         struct io_worker *worker;
 674
 675         worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
 676         if (!worker)
 677                 return false;
 678
 679         refcount_set(&worker->ref, 1);
 680         worker->nulls_node.pprev = NULL;
 681         worker->wqe = wqe;
 682         spin_lock_init(&worker->lock);
 683
 684         worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
 685                                 "io_wqe_worker-%d/%d", index, wqe->node);
 686         if (IS_ERR(worker->task)) {
 687                 kfree(worker);
 688                 return false;
 689         }
 690         kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
 691
 692         raw_spin_lock_irq(&wqe->lock);
 693         hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 694         list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 695         worker->flags |= IO_WORKER_F_FREE;
 696         if (index == IO_WQ_ACCT_BOUND)
 697                 worker->flags |= IO_WORKER_F_BOUND;
 698         if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
 699                 worker->flags |= IO_WORKER_F_FIXED;
 700         acct->nr_workers++;
 701         raw_spin_unlock_irq(&wqe->lock);
 702
 703         if (index == IO_WQ_ACCT_UNBOUND)
 704                 atomic_inc(&wq->user->processes);
 705
 706         refcount_inc(&wq->refs);
 707         wake_up_process(worker->task);
 708         return true;
 709 }
 710
 711 static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 712         __must_hold(wqe->lock)
 713 {
 714         struct io_wqe_acct *acct = &wqe->acct[index];
 715
 716         /* if we have available workers or no work, no need */
 717         if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
 718                 return false;
 719         return acct->nr_workers < acct->max_workers;
 720 }
 721
 722 /*
 723  * Iterate the passed in list and call the specific function for each
 724  * worker that isn't exiting
 725  */
 726 static bool io_wq_for_each_worker(struct io_wqe *wqe,
 727                                   bool (*func)(struct io_worker *, void *),
 728                                   void *data)
 729 {
 730         struct io_worker *worker;
 731         bool ret = false;
 732
 733         list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
 734                 if (io_worker_get(worker)) {
 735                         /* no task if node is/was offline */
 736                         if (worker->task)
 737                                 ret = func(worker, data);
 738                         io_worker_release(worker);
 739                         if (ret)
 740                                 break;
 741                 }
 742         }
 743
 744         return ret;
 745 }
 746
 747 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 748 {
 749         wake_up_process(worker->task);
 750         return false;
 751 }
 752
 753 /*
 754  * Manager thread. Tasked with creating new workers, if we need them.
 755  */
 756 static int io_wq_manager(void *data)
 757 {
 758         struct io_wq *wq = data;
 759         int node;
 760
 761         refcount_set(&wq->refs, 1);
 762         complete(&wq->done);
 763
 764         while (!kthread_should_stop()) {
 765                 for_each_node(node) {
 766                         struct io_wqe *wqe = wq->wqes[node];
 767                         bool fork_worker[2] = { false, false };
 768
 769                         if (!node_online(node))
 770                                 continue;
 771
 772                         raw_spin_lock_irq(&wqe->lock);
 773                         if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
 774                                 fork_worker[IO_WQ_ACCT_BOUND] = true;
 775                         if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
 776                                 fork_worker[IO_WQ_ACCT_UNBOUND] = true;
 777                         raw_spin_unlock_irq(&wqe->lock);
 778                         if (fork_worker[IO_WQ_ACCT_BOUND])
 779                                 create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
 780                         if (fork_worker[IO_WQ_ACCT_UNBOUND])
 781                                 create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
 782                 }
 783                 set_current_state(TASK_INTERRUPTIBLE);
 784                 schedule_timeout(HZ);
 785         }
 786
 787         if (refcount_dec_and_test(&wq->refs)) {
 788                 complete(&wq->done);
 789                 return 0;
 790         }
 791         /* if ERROR is set and we get here, we have workers to wake */
 792         if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
 793                 rcu_read_lock();
 794                 for_each_node(node)
 795                         io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
 796                 rcu_read_unlock();
 797         }
 798         return 0;
 799 }
 800
 801 static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 802                             struct io_wq_work *work)
 803 {
 804         bool free_worker;
 805
 806         if (!(work->flags & IO_WQ_WORK_UNBOUND))
 807                 return true;
 808         if (atomic_read(&acct->nr_running))
 809                 return true;
 810
 811         rcu_read_lock();
 812         free_worker = !hlist_nulls_empty(&wqe->free_list);
 813         rcu_read_unlock();
 814         if (free_worker)
 815                 return true;
 816
 817         if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
 818             !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
 819                 return false;
 820
 821         return true;
 822 }
 823
 824 static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 825 {
 826         struct io_wq *wq = wqe->wq;
 827
 828         do {
 829                 work->flags |= IO_WQ_WORK_CANCEL;
 830                 wq->do_work(work);
 831                 work = wq->free_work(work);
 832         } while (work);
 833 }
 834
 835 static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
 836 {
 837         unsigned int hash;
 838         struct io_wq_work *tail;
 839
 840         if (!io_wq_is_hashed(work)) {
 841 append:
 842                 wq_list_add_tail(&work->list, &wqe->work_list);
 843                 return;
 844         }
 845
 846         hash = io_get_work_hash(work);
 847         tail = wqe->hash_tail[hash];
 848         wqe->hash_tail[hash] = work;
 849         if (!tail)
 850                 goto append;
 851
 852         wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
 853 }
 854
 855 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 856 {
 857         struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
 858         int work_flags;
 859         unsigned long flags;
 860
 861         /*
 862          * Do early check to see if we need a new unbound worker, and if we do,
 863          * if we're allowed to do so. This isn't 100% accurate as there's a
 864          * gap between this check and incrementing the value, but that's OK.
 865          * It's close enough to not be an issue, fork() has the same delay.
 866          */
 867         if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
 868                 io_run_cancel(work, wqe);
 869                 return;
 870         }
 871
 872         work_flags = work->flags;
 873         raw_spin_lock_irqsave(&wqe->lock, flags);
 874         io_wqe_insert_work(wqe, work);
 875         wqe->flags &= ~IO_WQE_FLAG_STALLED;
 876         raw_spin_unlock_irqrestore(&wqe->lock, flags);
 877
 878         if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
 879             !atomic_read(&acct->nr_running))
 880                 io_wqe_wake_worker(wqe, acct);
 881 }
 882
 883 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 884 {
 885         struct io_wqe *wqe = wq->wqes[numa_node_id()];
 886
 887         io_wqe_enqueue(wqe, work);
 888 }
 889
 890 /*
 891  * Work items that hash to the same value will not be done in parallel.
 892  * Used to limit concurrent writes, generally hashed by inode.
 893  */
 894 void io_wq_hash_work(struct io_wq_work *work, void *val)
 895 {
 896         unsigned int bit;
 897
 898         bit = hash_ptr(val, IO_WQ_HASH_ORDER);
 899         work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
 900 }
 901
/*
 * State carried through one cancellation pass: the match callback and its
 * argument, plus counters for what has been hit so far.
 */
struct io_cb_cancel_data {
        work_cancel_fn *fn;     /* match callback, invoked as fn(work, data) */
        void *data;             /* opaque argument handed to ->fn */
        int nr_running;         /* matched items whose worker was signalled */
        int nr_pending;         /* matched items removed from a pending list */
        bool cancel_all;        /* false: stop after the first match */
};
 909
 910 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 911 {
 912         struct io_cb_cancel_data *match = data;
 913         unsigned long flags;
 914
 915         /*
 916          * Hold the lock to avoid ->cur_work going out of scope, caller
 917          * may dereference the passed in work.
 918          */
 919         spin_lock_irqsave(&worker->lock, flags);
 920         if (worker->cur_work &&
 921             match->fn(worker->cur_work, match->data)) {
 922                 send_sig(SIGINT, worker->task, 1);
 923                 match->nr_running++;
 924         }
 925         spin_unlock_irqrestore(&worker->lock, flags);
 926
 927         return match->nr_running && !match->cancel_all;
 928 }
 929
 930 static inline void io_wqe_remove_pending(struct io_wqe *wqe,
 931                                          struct io_wq_work *work,
 932                                          struct io_wq_work_node *prev)
 933 {
 934         unsigned int hash = io_get_work_hash(work);
 935         struct io_wq_work *prev_work = NULL;
 936
 937         if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
 938                 if (prev)
 939                         prev_work = container_of(prev, struct io_wq_work, list);
 940                 if (prev_work && io_get_work_hash(prev_work) == hash)
 941                         wqe->hash_tail[hash] = prev_work;
 942                 else
 943                         wqe->hash_tail[hash] = NULL;
 944         }
 945         wq_list_del(&wqe->work_list, &work->list, prev);
 946 }
 947
 948 static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 949                                        struct io_cb_cancel_data *match)
 950 {
 951         struct io_wq_work_node *node, *prev;
 952         struct io_wq_work *work;
 953         unsigned long flags;
 954
 955 retry:
 956         raw_spin_lock_irqsave(&wqe->lock, flags);
 957         wq_list_for_each(node, prev, &wqe->work_list) {
 958                 work = container_of(node, struct io_wq_work, list);
 959                 if (!match->fn(work, match->data))
 960                         continue;
 961                 io_wqe_remove_pending(wqe, work, prev);
 962                 raw_spin_unlock_irqrestore(&wqe->lock, flags);
 963                 io_run_cancel(work, wqe);
 964                 match->nr_pending++;
 965                 if (!match->cancel_all)
 966                         return;
 967
 968                 /* not safe to continue after unlock */
 969                 goto retry;
 970         }
 971         raw_spin_unlock_irqrestore(&wqe->lock, flags);
 972 }
 973
/*
 * Try to cancel work that is already running on @wqe.
 *
 * io_wq_worker_cancel() is handed to io_wq_for_each_worker() as the
 * per-worker callback, with @match as its argument; the walk is done inside
 * an RCU read-side critical section.
 */
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
                                       struct io_cb_cancel_data *match)
{
        rcu_read_lock();
        io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
        rcu_read_unlock();
}
 981
 982 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 983                                   void *data, bool cancel_all)
 984 {
 985         struct io_cb_cancel_data match = {
 986                 .fn             = cancel,
 987                 .data           = data,
 988                 .cancel_all     = cancel_all,
 989         };
 990         int node;
 991
 992         /*
 993          * First check pending list, if we're lucky we can just remove it
 994          * from there. CANCEL_OK means that the work is returned as-new,
 995          * no completion will be posted for it.
 996          */
 997         for_each_node(node) {
 998                 struct io_wqe *wqe = wq->wqes[node];
 999
1000                 io_wqe_cancel_pending_work(wqe, &match);
1001                 if (match.nr_pending && !match.cancel_all)
1002                         return IO_WQ_CANCEL_OK;
1003         }
1004
1005         /*
1006          * Now check if a free (going busy) or busy worker has the work
1007          * currently running. If we find it there, we'll return CANCEL_RUNNING
1008          * as an indication that we attempt to signal cancellation. The
1009          * completion will run normally in this case.
1010          */
1011         for_each_node(node) {
1012                 struct io_wqe *wqe = wq->wqes[node];
1013
1014                 io_wqe_cancel_running_work(wqe, &match);
1015                 if (match.nr_running && !match.cancel_all)
1016                         return IO_WQ_CANCEL_RUNNING;
1017         }
1018
1019         if (match.nr_running)
1020                 return IO_WQ_CANCEL_RUNNING;
1021         if (match.nr_pending)
1022                 return IO_WQ_CANCEL_OK;
1023         return IO_WQ_CANCEL_NOTFOUND;
1024 }
1025
1026 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
1027 {
1028         int ret = -ENOMEM, node;
1029         struct io_wq *wq;
1030
1031         if (WARN_ON_ONCE(!data->free_work || !data->do_work))
1032                 return ERR_PTR(-EINVAL);
1033
1034         wq = kzalloc(sizeof(*wq), GFP_KERNEL);
1035         if (!wq)
1036                 return ERR_PTR(-ENOMEM);
1037
1038         wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
1039         if (!wq->wqes)
1040                 goto err_wq;
1041
1042         ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1043         if (ret)
1044                 goto err_wqes;
1045
1046         wq->free_work = data->free_work;
1047         wq->do_work = data->do_work;
1048
1049         /* caller must already hold a reference to this */
1050         wq->user = data->user;
1051
1052         ret = -ENOMEM;
1053         for_each_node(node) {
1054                 struct io_wqe *wqe;
1055                 int alloc_node = node;
1056
1057                 if (!node_online(alloc_node))
1058                         alloc_node = NUMA_NO_NODE;
1059                 wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
1060                 if (!wqe)
1061                         goto err;
1062                 wq->wqes[node] = wqe;
1063                 wqe->node = alloc_node;
1064                 wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
1065                 atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
1066                 if (wq->user) {
1067                         wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
1068                                         task_rlimit(current, RLIMIT_NPROC);
1069                 }
1070                 atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
1071                 wqe->wq = wq;
1072                 raw_spin_lock_init(&wqe->lock);
1073                 INIT_WQ_LIST(&wqe->work_list);
1074                 INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
1075                 INIT_LIST_HEAD(&wqe->all_list);
1076         }
1077
1078         init_completion(&wq->done);
1079
1080         wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
1081         if (!IS_ERR(wq->manager)) {
1082                 wake_up_process(wq->manager);
1083                 wait_for_completion(&wq->done);
1084                 if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
1085                         ret = -ENOMEM;
1086                         goto err;
1087                 }
1088                 reinit_completion(&wq->done);
1089                 return wq;
1090         }
1091
1092         ret = PTR_ERR(wq->manager);
1093         complete(&wq->done);
1094 err:
1095         cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1096         for_each_node(node)
1097                 kfree(wq->wqes[node]);
1098 err_wqes:
1099         kfree(wq->wqes);
1100 err_wq:
1101         kfree(wq);
1102         return ERR_PTR(ret);
1103 }
1104
1105 void io_wq_destroy(struct io_wq *wq)
1106 {
1107         int node;
1108
1109         cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1110
1111         set_bit(IO_WQ_BIT_EXIT, &wq->state);
1112         if (wq->manager)
1113                 kthread_stop(wq->manager);
1114
1115         rcu_read_lock();
1116         for_each_node(node)
1117                 io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
1118         rcu_read_unlock();
1119
1120         wait_for_completion(&wq->done);
1121
1122         for_each_node(node)
1123                 kfree(wq->wqes[node]);
1124         kfree(wq->wqes);
1125         kfree(wq);
1126 }
1127
1128 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
1129 {
1130         struct task_struct *task = worker->task;
1131         struct rq_flags rf;
1132         struct rq *rq;
1133
1134         rq = task_rq_lock(task, &rf);
1135         do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
1136         task->flags |= PF_NO_SETAFFINITY;
1137         task_rq_unlock(rq, task, &rf);
1138         return false;
1139 }
1140
1141 static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
1142 {
1143         struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
1144         int i;
1145
1146         rcu_read_lock();
1147         for_each_node(i)
1148                 io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
1149         rcu_read_unlock();
1150         return 0;
1151 }
1152
1153 static __init int io_wq_init(void)
1154 {
1155         int ret;
1156
1157         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
1158                                         io_wq_cpu_online, NULL);
1159         if (ret < 0)
1160                 return ret;
1161         io_wq_online = ret;
1162         return 0;
1163 }
1164 subsys_initcall(io_wq_init);