fs/io-wq.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Basic worker thread pool for io_uring
   4  *
   5  * Copyright (C) 2019 Jens Axboe
   6  *
   7  */
   8 #include <linux/kernel.h>
   9 #include <linux/init.h>
  10 #include <linux/errno.h>
  11 #include <linux/sched/signal.h>
  12 #include <linux/mm.h>
  13 #include <linux/sched/mm.h>
  14 #include <linux/percpu.h>
  15 #include <linux/slab.h>
  16 #include <linux/kthread.h>
  17 #include <linux/rculist_nulls.h>
  18 #include <linux/fs_struct.h>
  19 #include <linux/blk-cgroup.h>
  20 #include <linux/audit.h>
  21 #include <linux/cpu.h>
  22
  23 #include "../kernel/sched/sched.h"
  24 #include "io-wq.h"
  25
  /* Idle sleep handed to schedule_timeout() in io_wqe_worker(); a non-fixed worker exits once it elapses. */
  26 #define WORKER_IDLE_TIMEOUT     (5 * HZ)
  27
  /* Bits stored in io_worker->flags. */
  28 enum {
  29         IO_WORKER_F_UP          = 1,    /* up and active */
  30         IO_WORKER_F_RUNNING     = 2,    /* account as running */
  31         IO_WORKER_F_FREE        = 4,    /* worker on free list */
  32         IO_WORKER_F_FIXED       = 8,    /* static idle worker */
  33         IO_WORKER_F_BOUND       = 16,   /* is doing bounded work */
  34 };
  35
  /* Bit numbers (not masks) tested with test_bit() on io_wq->state. */
  36 enum {
  37         IO_WQ_BIT_EXIT          = 0,    /* wq exiting */
  38         IO_WQ_BIT_ERROR         = 1,    /* error on setup */
  39 };
  40
  /* Bits stored in io_wqe->flags. */
  41 enum {
  42         IO_WQE_FLAG_STALLED     = 1,    /* stalled on hash */
  43 };
  44
  45 /*
  46  * One for each thread in a wqe pool
      *
      * Allocated by create_io_worker() and released with kfree_rcu() from
      * io_worker_exit().
  47  */
  48 struct io_worker {
  49         refcount_t ref;                         /* see io_worker_get()/io_worker_release() */
  50         unsigned flags;                         /* IO_WORKER_F_* bits */
  51         struct hlist_nulls_node nulls_node;     /* link in wqe->free_list */
  52         struct list_head all_list;              /* link in wqe->all_list */
  53         struct task_struct *task;               /* kthread running io_wqe_worker() */
  54         struct io_wqe *wqe;                     /* pool this worker belongs to */
  55
  56         struct io_wq_work *cur_work;            /* item currently assigned, NULL if none */
  57         spinlock_t lock;                        /* held while cur_work is updated */
  58
  59         struct rcu_head rcu;                    /* for kfree_rcu() in io_worker_exit() */
  60         struct mm_struct *mm;                   /* mm attached by io_wq_switch_mm(), NULL if none */
  61 #ifdef CONFIG_BLK_CGROUP
  62         struct cgroup_subsys_state *blkcg_css;  /* blkcg last associated by io_wq_switch_blkcg() */
  63 #endif
  64         const struct cred *cur_creds;           /* creds last installed by io_wq_switch_creds() */
  65         const struct cred *saved_creds;         /* creds to revert to in __io_worker_unuse() */
  66         struct nsproxy *restore_nsproxy;        /* nsproxy captured in io_worker_start() */
  67 };
  68
  /*
   * Width, in bits, of the value io_wq_hash_work() derives with hash_ptr().
   * 1 << order equals BITS_PER_LONG on both branches, so each possible hash
   * has its own bit in the unsigned long io_wqe->hash_map.
   */
  69 #if BITS_PER_LONG == 64
  70 #define IO_WQ_HASH_ORDER        6
  71 #else
  72 #define IO_WQ_HASH_ORDER        5
  73 #endif
  74
     /* Number of entries in io_wqe->hash_tail[]. */
  75 #define IO_WQ_NR_HASH_BUCKETS   (1u << IO_WQ_HASH_ORDER)
  76
  /* Worker accounting for one class (bound or unbound) of a wqe. */
  77 struct io_wqe_acct {
  78         unsigned nr_workers;    /* workers currently in this class; updated under wqe->lock */
  79         unsigned max_workers;   /* no new worker is requested once nr_workers reaches this */
  80         atomic_t nr_running;    /* workers of this class currently accounted as running */
  81 };
  82
  /* Indexes into io_wqe->acct[]. */
  83 enum {
  84         IO_WQ_ACCT_BOUND,
  85         IO_WQ_ACCT_UNBOUND,
  86 };
  87
  88 /*
  89  * Per-node worker thread pool
  90  */
  91 struct io_wqe {
  92         struct {
  93                 raw_spinlock_t lock;                    /* held around work_list/hash_map/flags updates */
  94                 struct io_wq_work_list work_list;       /* pending work */
  95                 unsigned long hash_map;                 /* one bit per hash currently owned by a worker */
  96                 unsigned flags;                         /* IO_WQE_FLAG_* bits */
  97         } ____cacheline_aligned_in_smp;
  98
  99         int node;                                       /* node passed to the worker allocation and kthread creation */
 100         struct io_wqe_acct acct[2];                     /* indexed by IO_WQ_ACCT_* */
 101
 102         struct hlist_nulls_head free_list;              /* idle workers, walked under RCU */
 103         struct list_head all_list;                      /* every worker of this pool */
 104
 105         struct io_wq *wq;                               /* owning io_wq */
 106         struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; /* last queued item per hash, NULL if none */
 107 };
 108
 109 /*
 110  * Per io_wq state
 111  */
 112 struct io_wq {
 113         struct io_wqe **wqes;           /* per-node pools, indexed by node id */
 114         unsigned long state;            /* IO_WQ_BIT_* bits */
 115
 116         free_work_fn *free_work;        /* called after do_work(); returns a follow-up item or NULL */
 117         io_wq_work_fn *do_work;         /* runs one work item */
 118
 119         struct task_struct *manager;    /* task woken when a new worker may be needed */
 120         struct user_struct *user;       /* ->processes is adjusted as workers enter/leave unbound accounting */
 121         refcount_t refs;                /* set to 1 by the manager, plus one per created worker */
 122         struct completion done;         /* completed at manager start and when refs drops to zero */
 123
 124         struct hlist_node cpuhp_node;   /* NOTE(review): not used in this part of the file; presumably CPU hotplug linkage */
 125
 126         refcount_t use_refs;            /* NOTE(review): not used in this part of the file */
 127 };
 128
 /* NOTE(review): not referenced in this part of the file; presumably the CPU hotplug state io-wq registers - confirm. */
 129 static enum cpuhp_state io_wq_online;
 130
 131 static bool io_worker_get(struct io_worker *worker)
 132 {
 133         return refcount_inc_not_zero(&worker->ref);
 134 }
 135
 136 static void io_worker_release(struct io_worker *worker)
 137 {
 138         if (refcount_dec_and_test(&worker->ref))
 139                 wake_up_process(worker->task);
 140 }
 141
 142 /*
      * Drop the per-work state that io_impersonate_work() attached to the
      * current task: overridden creds, files/nsproxy, fs, mm, the blkcg
      * association and the RLIMIT_FSIZE override.
      *
 143  * Note: drops the wqe->lock if returning true! The caller must re-acquire
 144  * the lock in that case. Some callers need to restart handling if this
 145  * happens, so we can't just re-acquire the lock on behalf of the caller.
 146  */
 147 static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 148 {
 149         bool dropped_lock = false;
 150
             /* go back to the creds saved by the first io_wq_switch_creds() */
 151         if (worker->saved_creds) {
 152                 revert_creds(worker->saved_creds);
 153                 worker->cur_creds = worker->saved_creds = NULL;
 154         }
 155
             /* files/nsproxy are reset under task_lock(), after wqe->lock is released */
 156         if (current->files) {
 157                 __acquire(&wqe->lock);
 158                 raw_spin_unlock_irq(&wqe->lock);
 159                 dropped_lock = true;
 160
 161                 task_lock(current);
 162                 current->files = NULL;
 163                 current->nsproxy = worker->restore_nsproxy;
 164                 task_unlock(current);
 165         }
 166
 167         if (current->fs)
 168                 current->fs = NULL;
 169
 170         /*
 171          * If we have an active mm, we need to drop the wq lock before unusing
 172          * it. If we do, return true and let the caller retry the idle loop.
 173          */
 174         if (worker->mm) {
                     /* the files branch above may already have released the lock */
 175                 if (!dropped_lock) {
 176                         __acquire(&wqe->lock);
 177                         raw_spin_unlock_irq(&wqe->lock);
 178                         dropped_lock = true;
 179                 }
 180                 __set_current_state(TASK_RUNNING);
 181                 kthread_unuse_mm(worker->mm);
 182                 mmput(worker->mm);
 183                 worker->mm = NULL;
 184         }
 185
 186 #ifdef CONFIG_BLK_CGROUP
 187         if (worker->blkcg_css) {
 188                 kthread_associate_blkcg(NULL);
 189                 worker->blkcg_css = NULL;
 190         }
 191 #endif
             /* undo a file size limit installed for IO_WQ_WORK_FSIZE work */
 192         if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
 193                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 194         return dropped_lock;
 195 }
 196
 197 static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
 198                                                    struct io_wq_work *work)
 199 {
 200         if (work->flags & IO_WQ_WORK_UNBOUND)
 201                 return &wqe->acct[IO_WQ_ACCT_UNBOUND];
 202
 203         return &wqe->acct[IO_WQ_ACCT_BOUND];
 204 }
 205
 206 static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
 207                                                   struct io_worker *worker)
 208 {
 209         if (worker->flags & IO_WORKER_F_BOUND)
 210                 return &wqe->acct[IO_WQ_ACCT_BOUND];
 211
 212         return &wqe->acct[IO_WQ_ACCT_UNBOUND];
 213 }
 214
 /*
  * Final teardown of a worker, called by io_wqe_worker() on the worker's own
  * thread right before it returns.
  *
  * Waits for temporary references to go away, undoes the worker's
  * accounting, unlinks it from the wqe lists, frees it after an RCU grace
  * period and drops the worker's reference on the io_wq.
  */
 215 static void io_worker_exit(struct io_worker *worker)
 216 {
 217         struct io_wqe *wqe = worker->wqe;
             /* looked up now: the choice depends on worker->flags, which is zeroed below */
 218         struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
 219
 220         /*
 221          * If we're not at zero, someone else is holding a brief reference
 222          * to the worker. Wait for that to go away.
 223          */
             /* io_worker_release() wakes this task when the last reference is dropped */
 224         set_current_state(TASK_INTERRUPTIBLE);
 225         if (!refcount_dec_and_test(&worker->ref))
 226                 schedule();
 227         __set_current_state(TASK_RUNNING);
 228
 229         preempt_disable();
 230         current->flags &= ~PF_IO_WORKER;
 231         if (worker->flags & IO_WORKER_F_RUNNING)
 232                 atomic_dec(&acct->nr_running);
             /* unbound workers are counted in user->processes, see create_io_worker() */
 233         if (!(worker->flags & IO_WORKER_F_BOUND))
 234                 atomic_dec(&wqe->wq->user->processes);
 235         worker->flags = 0;
 236         preempt_enable();
 237
 238         raw_spin_lock_irq(&wqe->lock);
 239         hlist_nulls_del_rcu(&worker->nulls_node);
 240         list_del_rcu(&worker->all_list);
             /* __io_worker_unuse() returns true when it released wqe->lock; take it again */
 241         if (__io_worker_unuse(wqe, worker)) {
 242                 __release(&wqe->lock);
 243                 raw_spin_lock_irq(&wqe->lock);
 244         }
 245         acct->nr_workers--;
 246         raw_spin_unlock_irq(&wqe->lock);
 247
 248         kfree_rcu(worker, rcu);
             /* drop the reference create_io_worker() took on wq->refs */
 249         if (refcount_dec_and_test(&wqe->wq->refs))
 250                 complete(&wqe->wq->done);
 251 }
 252
 253 static inline bool io_wqe_run_queue(struct io_wqe *wqe)
 254         __must_hold(wqe->lock)
 255 {
 256         if (!wq_list_empty(&wqe->work_list) &&
 257             !(wqe->flags & IO_WQE_FLAG_STALLED))
 258                 return true;
 259         return false;
 260 }
 261
 262 /*
 263  * Check head of free list for an available worker. If one isn't available,
 264  * caller must wake up the wq manager to create one.
 265  */
 266 static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 267         __must_hold(RCU)
 268 {
 269         struct hlist_nulls_node *n;
 270         struct io_worker *worker;
 271
 272         n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
 273         if (is_a_nulls(n))
 274                 return false;
 275
 276         worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
 277         if (io_worker_get(worker)) {
 278                 wake_up_process(worker->task);
 279                 io_worker_release(worker);
 280                 return true;
 281         }
 282
 283         return false;
 284 }
 285
 286 /*
 287  * We need a worker. If we find a free one, we're good. If not, and we're
 288  * below the max number of workers, wake up the manager to create one.
 289  */
 290 static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 291 {
 292         bool ret;
 293
 294         /*
 295          * Most likely an attempt to queue unbounded work on an io_wq that
 296          * wasn't setup with any unbounded workers.
 297          */
 298         WARN_ON_ONCE(!acct->max_workers);
 299
 300         rcu_read_lock();
 301         ret = io_wqe_activate_free_worker(wqe);
 302         rcu_read_unlock();
 303
 304         if (!ret && acct->nr_workers < acct->max_workers)
 305                 wake_up_process(wqe->wq->manager);
 306 }
 307
 308 static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
 309 {
 310         struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
 311
 312         atomic_inc(&acct->nr_running);
 313 }
 314
 315 static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
 316         __must_hold(wqe->lock)
 317 {
 318         struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
 319
 320         if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
 321                 io_wqe_wake_worker(wqe, acct);
 322 }
 323
 /*
  * One-time setup done by io_wqe_worker() on the new worker thread before
  * it enters its main loop: mark the task as an io worker, start with no
  * fs/files attached, and account the worker as up and running.
  */
 324 static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
 325 {
 326         allow_kernel_signal(SIGINT);
 327
 328         current->flags |= PF_IO_WORKER;
 329         current->fs = NULL;
 330         current->files = NULL;
 331
 332         worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
             /* remembered so __io_worker_unuse() can put it back after files work */
 333         worker->restore_nsproxy = current->nsproxy;
 334         io_wqe_inc_running(wqe, worker);
 335 }
 336
 337 /*
 338  * Worker will start processing some work. Move it to the busy list, if
 339  * it's currently on the freelist
 340  */
 341 static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 342                              struct io_wq_work *work)
 343         __must_hold(wqe->lock)
 344 {
 345         bool worker_bound, work_bound;
 346
 347         if (worker->flags & IO_WORKER_F_FREE) {
 348                 worker->flags &= ~IO_WORKER_F_FREE;
 349                 hlist_nulls_del_init_rcu(&worker->nulls_node);
 350         }
 351
 352         /*
 353          * If worker is moving from bound to unbound (or vice versa), then
 354          * ensure we update the running accounting.
 355          */
 356         worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
 357         work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
 358         if (worker_bound != work_bound) {
 359                 io_wqe_dec_running(wqe, worker);
 360                 if (work_bound) {
 361                         worker->flags |= IO_WORKER_F_BOUND;
 362                         wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
 363                         wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
 364                         atomic_dec(&wqe->wq->user->processes);
 365                 } else {
 366                         worker->flags &= ~IO_WORKER_F_BOUND;
 367                         wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
 368                         wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
 369                         atomic_inc(&wqe->wq->user->processes);
 370                 }
 371                 io_wqe_inc_running(wqe, worker);
 372          }
 373 }
 374
 375 /*
 376  * No work, worker going to sleep. Move to freelist, and unuse mm if we
 377  * have one attached. Dropping the mm may potentially sleep, so we drop
 378  * the lock in that case and return success. Since the caller has to
 379  * retry the loop in that case (we changed task state), we don't regrab
 380  * the lock if we return success.
 381  */
 382 static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 383         __must_hold(wqe->lock)
 384 {
 385         if (!(worker->flags & IO_WORKER_F_FREE)) {
 386                 worker->flags |= IO_WORKER_F_FREE;
 387                 hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 388         }
 389
 390         return __io_worker_unuse(wqe, worker);
 391 }
 392
 393 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 394 {
 395         return work->flags >> IO_WQ_HASH_SHIFT;
 396 }
 397
 398 static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 399         __must_hold(wqe->lock)
 400 {
 401         struct io_wq_work_node *node, *prev;
 402         struct io_wq_work *work, *tail;
 403         unsigned int hash;
 404
 405         wq_list_for_each(node, prev, &wqe->work_list) {
 406                 work = container_of(node, struct io_wq_work, list);
 407
 408                 /* not hashed, can run anytime */
 409                 if (!io_wq_is_hashed(work)) {
 410                         wq_list_del(&wqe->work_list, node, prev);
 411                         return work;
 412                 }
 413
 414                 /* hashed, can run if not already running */
 415                 hash = io_get_work_hash(work);
 416                 if (!(wqe->hash_map & BIT(hash))) {
 417                         wqe->hash_map |= BIT(hash);
 418                         /* all items with this hash lie in [work, tail] */
 419                         tail = wqe->hash_tail[hash];
 420                         wqe->hash_tail[hash] = NULL;
 421                         wq_list_cut(&wqe->work_list, &tail->list, prev);
 422                         return work;
 423                 }
 424         }
 425
 426         return NULL;
 427 }
 428
 429 static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
 430 {
 431         if (worker->mm) {
 432                 kthread_unuse_mm(worker->mm);
 433                 mmput(worker->mm);
 434                 worker->mm = NULL;
 435         }
 436
 437         if (mmget_not_zero(work->identity->mm)) {
 438                 kthread_use_mm(work->identity->mm);
 439                 worker->mm = work->identity->mm;
 440                 return;
 441         }
 442
 443         /* failed grabbing mm, ensure work gets cancelled */
 444         work->flags |= IO_WQ_WORK_CANCEL;
 445 }
 446
 /*
  * For work flagged IO_WQ_WORK_BLKCG, associate the worker with the blkcg
  * from the work item's identity unless it already is. Compiles to nothing
  * without CONFIG_BLK_CGROUP.
  */
 static inline void io_wq_switch_blkcg(struct io_worker *worker,
                                       struct io_wq_work *work)
 {
 #ifdef CONFIG_BLK_CGROUP
         if (!(work->flags & IO_WQ_WORK_BLKCG))
                 return;
         if (work->identity->blkcg_css == worker->blkcg_css)
                 return;

         kthread_associate_blkcg(work->identity->blkcg_css);
         worker->blkcg_css = work->identity->blkcg_css;
 #endif
 }
 459
 460 static void io_wq_switch_creds(struct io_worker *worker,
 461                                struct io_wq_work *work)
 462 {
 463         const struct cred *old_creds = override_creds(work->identity->creds);
 464
 465         worker->cur_creds = work->identity->creds;
 466         if (worker->saved_creds)
 467                 put_cred(old_creds); /* creds set by previous switch */
 468         else
 469                 worker->saved_creds = old_creds;
 470 }
 471
 472 static void io_impersonate_work(struct io_worker *worker,
 473                                 struct io_wq_work *work)
 474 {
 475         if ((work->flags & IO_WQ_WORK_FILES) &&
 476             current->files != work->identity->files) {
 477                 task_lock(current);
 478                 current->files = work->identity->files;
 479                 current->nsproxy = work->identity->nsproxy;
 480                 task_unlock(current);
 481                 if (!work->identity->files) {
 482                         /* failed grabbing files, ensure work gets cancelled */
 483                         work->flags |= IO_WQ_WORK_CANCEL;
 484                 }
 485         }
 486         if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
 487                 current->fs = work->identity->fs;
 488         if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
 489                 io_wq_switch_mm(worker, work);
 490         if ((work->flags & IO_WQ_WORK_CREDS) &&
 491             worker->cur_creds != work->identity->creds)
 492                 io_wq_switch_creds(worker, work);
 493         if (work->flags & IO_WQ_WORK_FSIZE)
 494                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
 495         else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY)
 496                 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 497         io_wq_switch_blkcg(worker, work);
 498 #ifdef CONFIG_AUDIT
 499         current->loginuid = work->identity->loginuid;
 500         current->sessionid = work->identity->sessionid;
 501 #endif
 502 }
 503
 504 static void io_assign_current_work(struct io_worker *worker,
 505                                    struct io_wq_work *work)
 506 {
 507         if (work) {
 508                 /* flush pending signals before assigning new work */
 509                 if (signal_pending(current))
 510                         flush_signals(current);
 511                 cond_resched();
 512         }
 513
 514 #ifdef CONFIG_AUDIT
 515         current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
 516         current->sessionid = AUDIT_SID_UNSET;
 517 #endif
 518
 519         spin_lock_irq(&worker->lock);
 520         worker->cur_work = work;
 521         spin_unlock_irq(&worker->lock);
 522 }
 523
 /* Forward declaration: io_worker_handle_work() calls it to queue linked work, the definition follows later. */
 524 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 525
 526 static void io_worker_handle_work(struct io_worker *worker)
 527         __releases(wqe->lock)
 528 {
 529         struct io_wqe *wqe = worker->wqe;
 530         struct io_wq *wq = wqe->wq;
 531
 532         do {
 533                 struct io_wq_work *work;
 534 get_next:
 535                 /*
 536                  * If we got some work, mark us as busy. If we didn't, but
 537                  * the list isn't empty, it means we stalled on hashed work.
 538                  * Mark us stalled so we don't keep looking for work when we
 539                  * can't make progress, any work completion or insertion will
 540                  * clear the stalled flag.
 541                  */
 542                 work = io_get_next_work(wqe);
 543                 if (work)
 544                         __io_worker_busy(wqe, worker, work);
 545                 else if (!wq_list_empty(&wqe->work_list))
 546                         wqe->flags |= IO_WQE_FLAG_STALLED;
 547
 548                 raw_spin_unlock_irq(&wqe->lock);
 549                 if (!work)
 550                         break;
 551                 io_assign_current_work(worker, work);
 552
 553                 /* handle a whole dependent link */
 554                 do {
 555                         struct io_wq_work *next_hashed, *linked;
 556                         unsigned int hash = io_get_work_hash(work);
 557
 558                         next_hashed = wq_next_work(work);
 559                         io_impersonate_work(worker, work);
 560                         wq->do_work(work);
 561                         io_assign_current_work(worker, NULL);
 562
 563                         linked = wq->free_work(work);
 564                         work = next_hashed;
 565                         if (!work && linked && !io_wq_is_hashed(linked)) {
 566                                 work = linked;
 567                                 linked = NULL;
 568                         }
 569                         io_assign_current_work(worker, work);
 570                         if (linked)
 571                                 io_wqe_enqueue(wqe, linked);
 572
 573                         if (hash != -1U && !next_hashed) {
 574                                 raw_spin_lock_irq(&wqe->lock);
 575                                 wqe->hash_map &= ~BIT_ULL(hash);
 576                                 wqe->flags &= ~IO_WQE_FLAG_STALLED;
 577                                 /* skip unnecessary unlock-lock wqe->lock */
 578                                 if (!work)
 579                                         goto get_next;
 580                                 raw_spin_unlock_irq(&wqe->lock);
 581                         }
 582                 } while (work);
 583
 584                 raw_spin_lock_irq(&wqe->lock);
 585         } while (1);
 586 }
 587
 /*
  * Thread function of a worker; @data is the struct io_worker set up by
  * create_io_worker().
  *
  * Loops until IO_WQ_BIT_EXIT is set or, for workers without
  * IO_WORKER_F_FIXED, until an idle sleep of WORKER_IDLE_TIMEOUT expires.
  * Work still queued at exit time is processed before the worker tears
  * itself down. Always returns 0.
  */
 588 static int io_wqe_worker(void *data)
 589 {
 590         struct io_worker *worker = data;
 591         struct io_wqe *wqe = worker->wqe;
 592         struct io_wq *wq = wqe->wq;
 593
 594         io_worker_start(wqe, worker);
 595
 596         while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                     /* NOTE(review): set before the queue check, presumably so a racing wake_up_process() is not lost */
 597                 set_current_state(TASK_INTERRUPTIBLE);
 598 loop:
 599                 raw_spin_lock_irq(&wqe->lock);
 600                 if (io_wqe_run_queue(wqe)) {
 601                         __set_current_state(TASK_RUNNING);
                             /* returns with wqe->lock released, so start over from the lock */
 602                         io_worker_handle_work(worker);
 603                         goto loop;
 604                 }
 605                 /* drops the lock on success, retry */
 606                 if (__io_worker_idle(wqe, worker)) {
 607                         __release(&wqe->lock);
 608                         goto loop;
 609                 }
 610                 raw_spin_unlock_irq(&wqe->lock);
 611                 if (signal_pending(current))
 612                         flush_signals(current);
                     /* non-zero means we were woken before the timeout ran out */
 613                 if (schedule_timeout(WORKER_IDLE_TIMEOUT))
 614                         continue;
 615                 /* timed out, exit unless we're the fixed worker */
 616                 if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
 617                     !(worker->flags & IO_WORKER_F_FIXED))
 618                         break;
 619         }
 620
             /* on wq exit, run whatever is still queued before going away */
 621         if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 622                 raw_spin_lock_irq(&wqe->lock);
 623                 if (!wq_list_empty(&wqe->work_list))
 624                         io_worker_handle_work(worker);
 625                 else
 626                         raw_spin_unlock_irq(&wqe->lock);
 627         }
 628
 629         io_worker_exit(worker);
 630         return 0;
 631 }
 632
 633 /*
 634  * Called when a worker is scheduled in. Mark us as currently running.
 635  */
 636 void io_wq_worker_running(struct task_struct *tsk)
 637 {
 638         struct io_worker *worker = kthread_data(tsk);
 639         struct io_wqe *wqe = worker->wqe;
 640
 641         if (!(worker->flags & IO_WORKER_F_UP))
 642                 return;
 643         if (worker->flags & IO_WORKER_F_RUNNING)
 644                 return;
 645         worker->flags |= IO_WORKER_F_RUNNING;
 646         io_wqe_inc_running(wqe, worker);
 647 }
 648
 649 /*
 650  * Called when worker is going to sleep. If there are no workers currently
 651  * running and we have work pending, wake up a free one or have the manager
 652  * set one up.
 653  */
 654 void io_wq_worker_sleeping(struct task_struct *tsk)
 655 {
 656         struct io_worker *worker = kthread_data(tsk);
 657         struct io_wqe *wqe = worker->wqe;
 658
 659         if (!(worker->flags & IO_WORKER_F_UP))
 660                 return;
 661         if (!(worker->flags & IO_WORKER_F_RUNNING))
 662                 return;
 663
 664         worker->flags &= ~IO_WORKER_F_RUNNING;
 665
 666         raw_spin_lock_irq(&wqe->lock);
 667         io_wqe_dec_running(wqe, worker);
 668         raw_spin_unlock_irq(&wqe->lock);
 669 }
 670
 /*
  * Allocate a worker for @wqe, start a kthread for it and publish it on the
  * wqe lists.
  *
  * @index selects the accounting class (IO_WQ_ACCT_BOUND or
  * IO_WQ_ACCT_UNBOUND) the worker starts in.
  *
  * Returns true on success, false if the allocation or the kthread creation
  * failed.
  */
 671 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 672 {
 673         struct io_wqe_acct *acct = &wqe->acct[index];
 674         struct io_worker *worker;
 675
 676         worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
 677         if (!worker)
 678                 return false;
 679
             /* the worker's own reference, dropped in io_worker_exit() */
 680         refcount_set(&worker->ref, 1);
 681         worker->nulls_node.pprev = NULL;
 682         worker->wqe = wqe;
 683         spin_lock_init(&worker->lock);
 684
 685         worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
 686                                 "io_wqe_worker-%d/%d", index, wqe->node);
 687         if (IS_ERR(worker->task)) {
 688                 kfree(worker);
 689                 return false;
 690         }
 691         kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));
 692
 693         raw_spin_lock_irq(&wqe->lock);
 694         hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 695         list_add_tail_rcu(&worker->all_list, &wqe->all_list);
 696         worker->flags |= IO_WORKER_F_FREE;
 697         if (index == IO_WQ_ACCT_BOUND)
 698                 worker->flags |= IO_WORKER_F_BOUND;
             /* a bound worker created while its class is empty becomes the fixed worker */
 699         if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
 700                 worker->flags |= IO_WORKER_F_FIXED;
 701         acct->nr_workers++;
 702         raw_spin_unlock_irq(&wqe->lock);
 703
             /* unbound workers are counted in user->processes */
 704         if (index == IO_WQ_ACCT_UNBOUND)
 705                 atomic_inc(&wq->user->processes);
 706
             /* one wq reference per worker, dropped again in io_worker_exit() */
 707         refcount_inc(&wq->refs);
 708         wake_up_process(worker->task);
 709         return true;
 710 }
 711
 712 static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 713         __must_hold(wqe->lock)
 714 {
 715         struct io_wqe_acct *acct = &wqe->acct[index];
 716
 717         /* if we have available workers or no work, no need */
 718         if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
 719                 return false;
 720         return acct->nr_workers < acct->max_workers;
 721 }
 722
 723 /*
 724  * Iterate the passed in list and call the specific function for each
 725  * worker that isn't exiting
 726  */
 727 static bool io_wq_for_each_worker(struct io_wqe *wqe,
 728                                   bool (*func)(struct io_worker *, void *),
 729                                   void *data)
 730 {
 731         struct io_worker *worker;
 732         bool ret = false;
 733
 734         list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
 735                 if (io_worker_get(worker)) {
 736                         /* no task if node is/was offline */
 737                         if (worker->task)
 738                                 ret = func(worker, data);
 739                         io_worker_release(worker);
 740                         if (ret)
 741                                 break;
 742                 }
 743         }
 744
 745         return ret;
 746 }
 747
 748 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 749 {
 750         wake_up_process(worker->task);
 751         return false;
 752 }
 753
 754 /*
 755  * Manager thread. Tasked with creating new workers, if we need them.
      *
      * @data is the struct io_wq. Until asked to stop, the thread visits every
      * online node, creates at most one bound and one unbound worker per node
      * when io_wqe_need_worker() says so, then sleeps for up to HZ.
      * io_wqe_wake_worker() wakes it early when no idle worker is available.
      * Always returns 0.
 756  */
 757 static int io_wq_manager(void *data)
 758 {
 759         struct io_wq *wq = data;
 760         int node;
 761
             /* the manager's own reference; NOTE(review): complete() presumably tells the creator the manager is running */
 762         refcount_set(&wq->refs, 1);
 763         complete(&wq->done);
 764
 765         while (!kthread_should_stop()) {
 766                 for_each_node(node) {
 767                         struct io_wqe *wqe = wq->wqes[node];
 768                         bool fork_worker[2] = { false, false };
 769
 770                         if (!node_online(node))
 771                                 continue;
 772
                             /* decide under the lock, create after dropping it: create_io_worker() allocates with GFP_KERNEL and takes wqe->lock itself */
 773                         raw_spin_lock_irq(&wqe->lock);
 774                         if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
 775                                 fork_worker[IO_WQ_ACCT_BOUND] = true;
 776                         if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
 777                                 fork_worker[IO_WQ_ACCT_UNBOUND] = true;
 778                         raw_spin_unlock_irq(&wqe->lock);
 779                         if (fork_worker[IO_WQ_ACCT_BOUND])
 780                                 create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
 781                         if (fork_worker[IO_WQ_ACCT_UNBOUND])
 782                                 create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
 783                 }
 784                 set_current_state(TASK_INTERRUPTIBLE);
 785                 schedule_timeout(HZ);
 786         }
 787
             /* drop the manager's reference; if no worker holds one either, signal completion */
 788         if (refcount_dec_and_test(&wq->refs)) {
 789                 complete(&wq->done);
 790                 return 0;
 791         }
 792         /* if ERROR is set and we get here, we have workers to wake */
 793         if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
 794                 rcu_read_lock();
 795                 for_each_node(node)
 796                         io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
 797                 rcu_read_unlock();
 798         }
 799         return 0;
 800 }
 801
 802 static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 803                             struct io_wq_work *work)
 804 {
 805         bool free_worker;
 806
 807         if (!(work->flags & IO_WQ_WORK_UNBOUND))
 808                 return true;
 809         if (atomic_read(&acct->nr_running))
 810                 return true;
 811
 812         rcu_read_lock();
 813         free_worker = !hlist_nulls_empty(&wqe->free_list);
 814         rcu_read_unlock();
 815         if (free_worker)
 816                 return true;
 817
 818         if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
 819             !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
 820                 return false;
 821
 822         return true;
 823 }
 824
 825 static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 826 {
 827         struct io_wq *wq = wqe->wq;
 828
 829         do {
 830                 work->flags |= IO_WQ_WORK_CANCEL;
 831                 wq->do_work(work);
 832                 work = wq->free_work(work);
 833         } while (work);
 834 }
 835
 836 static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
 837 {
 838         unsigned int hash;
 839         struct io_wq_work *tail;
 840
 841         if (!io_wq_is_hashed(work)) {
 842 append:
 843                 wq_list_add_tail(&work->list, &wqe->work_list);
 844                 return;
 845         }
 846
 847         hash = io_get_work_hash(work);
 848         tail = wqe->hash_tail[hash];
 849         wqe->hash_tail[hash] = work;
 850         if (!tail)
 851                 goto append;
 852
 853         wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
 854 }
 855
 856 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 857 {
 858         struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
 859         int work_flags;
 860         unsigned long flags;
 861
 862         /*
 863          * Do early check to see if we need a new unbound worker, and if we do,
 864          * if we're allowed to do so. This isn't 100% accurate as there's a
 865          * gap between this check and incrementing the value, but that's OK.
 866          * It's close enough to not be an issue, fork() has the same delay.
 867          */
 868         if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
 869                 io_run_cancel(work, wqe);
 870                 return;
 871         }
 872
 873         work_flags = work->flags;
 874         raw_spin_lock_irqsave(&wqe->lock, flags);
 875         io_wqe_insert_work(wqe, work);
 876         wqe->flags &= ~IO_WQE_FLAG_STALLED;
 877         raw_spin_unlock_irqrestore(&wqe->lock, flags);
 878
 879         if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
 880             !atomic_read(&acct->nr_running))
 881                 io_wqe_wake_worker(wqe, acct);
 882 }
 883
 884 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 885 {
 886         struct io_wqe *wqe = wq->wqes[numa_node_id()];
 887
 888         io_wqe_enqueue(wqe, work);
 889 }
 890
 891 /*
 892  * Work items that hash to the same value will not be done in parallel.
 893  * Used to limit concurrent writes, generally hashed by inode.
 894  */
 895 void io_wq_hash_work(struct io_wq_work *work, void *val)
 896 {
 897         unsigned int bit;
 898
 899         bit = hash_ptr(val, IO_WQ_HASH_ORDER);
 900         work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
 901 }
 902
 903 struct io_cb_cancel_data {
 904         work_cancel_fn *fn;
 905         void *data;
 906         int nr_running;
 907         int nr_pending;
 908         bool cancel_all;
 909 };
 910
 911 static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 912 {
 913         struct io_cb_cancel_data *match = data;
 914         unsigned long flags;
 915
 916         /*
 917          * Hold the lock to avoid ->cur_work going out of scope, caller
 918          * may dereference the passed in work.
 919          */
 920         spin_lock_irqsave(&worker->lock, flags);
 921         if (worker->cur_work &&
 922             match->fn(worker->cur_work, match->data)) {
 923                 send_sig(SIGINT, worker->task, 1);
 924                 match->nr_running++;
 925         }
 926         spin_unlock_irqrestore(&worker->lock, flags);
 927
 928         return match->nr_running && !match->cancel_all;
 929 }
 930
 931 static inline void io_wqe_remove_pending(struct io_wqe *wqe,
 932                                          struct io_wq_work *work,
 933                                          struct io_wq_work_node *prev)
 934 {
 935         unsigned int hash = io_get_work_hash(work);
 936         struct io_wq_work *prev_work = NULL;
 937
 938         if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
 939                 if (prev)
 940                         prev_work = container_of(prev, struct io_wq_work, list);
 941                 if (prev_work && io_get_work_hash(prev_work) == hash)
 942                         wqe->hash_tail[hash] = prev_work;
 943                 else
 944                         wqe->hash_tail[hash] = NULL;
 945         }
 946         wq_list_del(&wqe->work_list, &work->list, prev);
 947 }
 948
 949 static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 950                                        struct io_cb_cancel_data *match)
 951 {
 952         struct io_wq_work_node *node, *prev;
 953         struct io_wq_work *work;
 954         unsigned long flags;
 955
 956 retry:
 957         raw_spin_lock_irqsave(&wqe->lock, flags);
 958         wq_list_for_each(node, prev, &wqe->work_list) {
 959                 work = container_of(node, struct io_wq_work, list);
 960                 if (!match->fn(work, match->data))
 961                         continue;
 962                 io_wqe_remove_pending(wqe, work, prev);
 963                 raw_spin_unlock_irqrestore(&wqe->lock, flags);
 964                 io_run_cancel(work, wqe);
 965                 match->nr_pending++;
 966                 if (!match->cancel_all)
 967                         return;
 968
 969                 /* not safe to continue after unlock */
 970                 goto retry;
 971         }
 972         raw_spin_unlock_irqrestore(&wqe->lock, flags);
 973 }
 974
 975 static void io_wqe_cancel_running_work(struct io_wqe *wqe,
 976                                        struct io_cb_cancel_data *match)
 977 {
 978         rcu_read_lock();
 979         io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
 980         rcu_read_unlock();
 981 }
 982
 983 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 984                                   void *data, bool cancel_all)
 985 {
 986         struct io_cb_cancel_data match = {
 987                 .fn             = cancel,
 988                 .data           = data,
 989                 .cancel_all     = cancel_all,
 990         };
 991         int node;
 992
 993         /*
 994          * First check pending list, if we're lucky we can just remove it
 995          * from there. CANCEL_OK means that the work is returned as-new,
 996          * no completion will be posted for it.
 997          */
 998         for_each_node(node) {
 999                 struct io_wqe *wqe = wq->wqes[node];
1000
1001                 io_wqe_cancel_pending_work(wqe, &match);
1002                 if (match.nr_pending && !match.cancel_all)
1003                         return IO_WQ_CANCEL_OK;
1004         }
1005
1006         /*
1007          * Now check if a free (going busy) or busy worker has the work
1008          * currently running. If we find it there, we'll return CANCEL_RUNNING
1009          * as an indication that we attempt to signal cancellation. The
1010          * completion will run normally in this case.
1011          */
1012         for_each_node(node) {
1013                 struct io_wqe *wqe = wq->wqes[node];
1014
1015                 io_wqe_cancel_running_work(wqe, &match);
1016                 if (match.nr_running && !match.cancel_all)
1017                         return IO_WQ_CANCEL_RUNNING;
1018         }
1019
1020         if (match.nr_running)
1021                 return IO_WQ_CANCEL_RUNNING;
1022         if (match.nr_pending)
1023                 return IO_WQ_CANCEL_OK;
1024         return IO_WQ_CANCEL_NOTFOUND;
1025 }
1026
1027 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
1028 {
1029         int ret = -ENOMEM, node;
1030         struct io_wq *wq;
1031
1032         if (WARN_ON_ONCE(!data->free_work || !data->do_work))
1033                 return ERR_PTR(-EINVAL);
1034
1035         wq = kzalloc(sizeof(*wq), GFP_KERNEL);
1036         if (!wq)
1037                 return ERR_PTR(-ENOMEM);
1038
1039         wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
1040         if (!wq->wqes)
1041                 goto err_wq;
1042
1043         ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1044         if (ret)
1045                 goto err_wqes;
1046
1047         wq->free_work = data->free_work;
1048         wq->do_work = data->do_work;
1049
1050         /* caller must already hold a reference to this */
1051         wq->user = data->user;
1052
1053         ret = -ENOMEM;
1054         for_each_node(node) {
1055                 struct io_wqe *wqe;
1056                 int alloc_node = node;
1057
1058                 if (!node_online(alloc_node))
1059                         alloc_node = NUMA_NO_NODE;
1060                 wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
1061                 if (!wqe)
1062                         goto err;
1063                 wq->wqes[node] = wqe;
1064                 wqe->node = alloc_node;
1065                 wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
1066                 atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
1067                 if (wq->user) {
1068                         wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
1069                                         task_rlimit(current, RLIMIT_NPROC);
1070                 }
1071                 atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
1072                 wqe->wq = wq;
1073                 raw_spin_lock_init(&wqe->lock);
1074                 INIT_WQ_LIST(&wqe->work_list);
1075                 INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
1076                 INIT_LIST_HEAD(&wqe->all_list);
1077         }
1078
1079         init_completion(&wq->done);
1080
1081         wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
1082         if (!IS_ERR(wq->manager)) {
1083                 wake_up_process(wq->manager);
1084                 wait_for_completion(&wq->done);
1085                 if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
1086                         ret = -ENOMEM;
1087                         goto err;
1088                 }
1089                 refcount_set(&wq->use_refs, 1);
1090                 reinit_completion(&wq->done);
1091                 return wq;
1092         }
1093
1094         ret = PTR_ERR(wq->manager);
1095         complete(&wq->done);
1096 err:
1097         cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1098         for_each_node(node)
1099                 kfree(wq->wqes[node]);
1100 err_wqes:
1101         kfree(wq->wqes);
1102 err_wq:
1103         kfree(wq);
1104         return ERR_PTR(ret);
1105 }
1106
1107 bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
1108 {
1109         if (data->free_work != wq->free_work || data->do_work != wq->do_work)
1110                 return false;
1111
1112         return refcount_inc_not_zero(&wq->use_refs);
1113 }
1114
1115 static void __io_wq_destroy(struct io_wq *wq)
1116 {
1117         int node;
1118
1119         cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
1120
1121         set_bit(IO_WQ_BIT_EXIT, &wq->state);
1122         if (wq->manager)
1123                 kthread_stop(wq->manager);
1124
1125         rcu_read_lock();
1126         for_each_node(node)
1127                 io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
1128         rcu_read_unlock();
1129
1130         wait_for_completion(&wq->done);
1131
1132         for_each_node(node)
1133                 kfree(wq->wqes[node]);
1134         kfree(wq->wqes);
1135         kfree(wq);
1136 }
1137
1138 void io_wq_destroy(struct io_wq *wq)
1139 {
1140         if (refcount_dec_and_test(&wq->use_refs))
1141                 __io_wq_destroy(wq);
1142 }
1143
1144 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
1145 {
1146         struct task_struct *task = worker->task;
1147         struct rq_flags rf;
1148         struct rq *rq;
1149
1150         rq = task_rq_lock(task, &rf);
1151         do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node));
1152         task->flags |= PF_NO_SETAFFINITY;
1153         task_rq_unlock(rq, task, &rf);
1154         return false;
1155 }
1156
1157 static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
1158 {
1159         struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
1160         int i;
1161
1162         rcu_read_lock();
1163         for_each_node(i)
1164                 io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
1165         rcu_read_unlock();
1166         return 0;
1167 }
1168
1169 static __init int io_wq_init(void)
1170 {
1171         int ret;
1172
1173         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
1174                                         io_wq_cpu_online, NULL);
1175         if (ret < 0)
1176                 return ret;
1177         io_wq_online = ret;
1178         return 0;
1179 }
1180 subsys_initcall(io_wq_init);